In [1]:
import unicodecsv

In [7]:
with open("beatles-diskography.csv", "rb") as f:
    reader = unicodecsv.DictReader(f)
    beatles_csv = list(reader)

## Reading CSV files without CSV reader

In [69]:


# Your task is to read the input DATAFILE line by line, and for the first 10 lines (not including the header)
# split each line on "," and then for each line, create a dictionary
# where the key is the header title of the field, and the value is the value of that field in the row.
# The function parse_file should return a list of dictionaries,
# each data line in the file being a single list entry.
# Field names and values should not contain extra whitespace, like spaces or newline characters.
# You can use the Python string method strip() to remove the extra whitespace.
# You have to parse only the first 10 data lines in this exercise,
# so the returned list should have 10 entries!
import os

DATADIR = ""
DATAFILE = "beatles-diskography.csv"


def parse_file(datafile):
    data = []
    with open(datafile, "r") as f:
        keys = []
        for number, line in enumerate(f):
            if number == 0:
                keys.extend(line.split(","))
            else:
                values = line.split(",")
                zip_values = zip(keys, values)
                diccio = dict((key.strip(), value.strip()) for key, value in zip_values)
                data.append(diccio)
            if number>= 10:
                break

    return data

## Reading excel files with xlrd module

In [116]:
#!/usr/bin/env python
"""
Your task is as follows:
- read the provided Excel file
- find and return the min and max values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples
Please see the test function for the expected return format
"""

import xlrd
from zipfile import ZipFile
datafile = "2013_ERCOT_Hourly_Load_Data.xls"


#def open_zip(datafile):
#    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
#        myzip.extractall()


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    ###example on how you can get the data
    sheet_data = [[sheet.cell_value(r, col) for col in range(sheet.ncols)] for r in range(sheet.nrows)]
    ### other useful methods:
    #print "\nROWS, COLUMNS, and CELLS:"
    #print "Number of rows in the sheet:", 
    #print sheet.nrows
    #print "Type of data in cell (row 3, col 2):", 
    #print sheet.cell_type(3, 2)
    #print "Value in cell (row 3, col 2):", 
    #print sheet.cell_value(3, 2)
    #print "Get a slice of values in column 3, from rows 1-3:"
    #print sheet.col_values(3, start_rowx=1, end_rowx=4)

    #print "\nDATES:"
    #print "Type of data in cell (row 1, col 0):", 
    #print sheet.cell_type(1, 0)
    #exceltime = sheet.cell_value(1, 0)
    #print "Time in Excel format:",
    #print exceltime
    #print "Convert time to a Python datetime tuple, from the Excel float:",
    #print xlrd.xldate_as_tuple(exceltime, 0)
    
    coast_col = sheet.col_values(1, start_rowx=1)
    average_coast = sum(coast_col) / float(len(coast_col))
    maxvalue = max(coast_col)
    minvalue = min(coast_col)

#Note that here, I could have used the index function on lists to retrieve the index that I was looking for.
    def search_row_val(value):
        for number, row in enumerate(coast_col):
            if row == value:
                return number + 1
            
    max_row_value = search_row_val(maxvalue)
    min_row_value = search_row_val(minvalue)
    max_time = sheet.cell_value(max_row_value, 0)
    min_time = sheet.cell_value(min_row_value, 0)
    max_time_tuple = xlrd.xldate_as_tuple(max_time , 0)
    min_time_tuple = xlrd.xldate_as_tuple(min_time , 0)

    
    data = {
            'maxtime': max_time_tuple,
            'maxvalue': maxvalue,
            'mintime': min_time_tuple,
            'minvalue': minvalue,
            'avgcoast': average_coast
    }
    return data


def test():
    #open_zip(datafile)
    data = parse_file(datafile)

    assert data['maxtime'] == (2013, 8, 13, 17, 0, 0)
    assert round(data['maxvalue'], 10) == round(18779.02551, 10)

test()

## Using JSON and retrieving info from API

In [123]:
# To experiment with this code freely you will have to run this code locally.
# Take a look at the main() function for an example of how to use the code.
# We have provided example json output in the other code editor tabs for you to
# look at, but you will not be able to run any queries through our UI.
import json
import requests


BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    # This is the main function for making queries to the musicbrainz API.
    # A json document should be returned by the query.
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "requesting", r.url

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    # This adds an artist name to the query parameters before making
    # an API call to the function above.
    params["query"] = "artist:" + name
    return query_site(url, params)


def pretty_print(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data


def main():
    '''
    Modify the function calls and indexing below to answer the questions on
    the next quiz. HINT: Note how the output we get from the site is a
    multi-level JSON document, so try making print statements to step through
    the structure one level at a time or copy the output to a separate output
    file.
    '''
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    #pretty_print(results)

    artist_id = results["artists"][1]["id"]
    print artist_id
    #print "\nARTIST:"
    #pretty_print(results["artists"][1])
#
    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    #releases = artist_data["releases"]
    #print "\nONE RELEASE:"
    #pretty_print(releases[0], indent=2)
    #release_titles = [r["title"] for r in releases]
#
    #print "\nALL TITLES:"
    #for t in release_titles:
    #    print t


if __name__ == '__main__':
    main()

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
9282c8b4-ca0b-4c6b-b7e3-4f7762dfc4d6


In [160]:
# Question 1
results_first_aid = query_by_name(ARTIST_URL, query_type["simple"], "First Aid Kit")
counter = 0
for item in results_first_aid["artists"]:
    if item["name"].lower() == "first aid kit":
        counter += 1
print counter
#pretty_print(results_first_aid)


requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AFirst+Aid+Kit&fmt=json
2


In [142]:
# Question 2
results_queen = query_by_name(ARTIST_URL, query_type["simple"], "Queen")
pretty_print(results_queen["artists"][0]["begin-area"])

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AQueen&fmt=json
{
    "id": "f03d09b3-39dc-4083-afd6-159e3f0d462f", 
    "name": "London", 
    "sort-name": "London"
}


In [143]:
# Question 3
results_beatles = query_by_name(ARTIST_URL, query_type["simple"], "Beatles")

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ABeatles&fmt=json


In [148]:
#pretty_print(results_beatles["artists"][0]["aliases"])

In [150]:
# Question 4
results_nirvana = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
pretty_print(results_nirvana["artists"][0]["disambiguation"])

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
90s US grunge band


In [152]:
# Qestion 5
results_one_d = query_by_name(ARTIST_URL, query_type["simple"], "One direction")
pretty_print(results_one_d["artists"][0])

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AOne+direction&fmt=json
{
    "area": {
        "id": "8a754a16-0027-3a29-b6d7-2b40ea0481ed", 
        "name": "United Kingdom", 
        "sort-name": "United Kingdom"
    }, 
    "begin-area": {
        "id": "f03d09b3-39dc-4083-afd6-159e3f0d462f", 
        "name": "London", 
        "sort-name": "London"
    }, 
    "country": "GB", 
    "disambiguation": "English-Irish boy band formed in 2010", 
    "id": "1a425bbd-cca4-4b2c-aeb7-71cb176c828a", 
    "life-span": {
        "begin": "2010-07", 
        "ended": null
    }, 
    "name": "One Direction", 
    "score": "100", 
    "sort-name": "One Direction", 
    "tags": [
        {
            "count": 2, 
            "name": "pop"
        }, 
        {
            "count": 1, 
            "name": "power pop"
        }, 
        {
            "count": 1, 
            "name": "dance-pop"
        }, 
        {
            "count": 1, 
            "name": "pop rock"
        }, 

## Task: Reading from csv files

In [225]:
"""
Your task is to process the supplied file and use the csv module to extract data from it.
The data comes from NREL (National Renewable Energy Laboratory) website. Each file
contains information from one meteorological station, in particular - about amount of
solar and wind energy for each hour of day.

Note that the first line of the datafile is neither data entry, nor header. It is a line
describing the data source. You should extract the name of the station from it.

The data should be returned as a list of lists (not dictionaries).
You can use the csv modules "reader" method to get data in such format.
Another useful method is next() - to get the next line from the iterator.
You should only change the parse_file function.
"""
import csv
import os

DATADIR = "/Users/Jose/Documents/Udacity/data_science/data_wrangling"
DATAFILE = "745090.csv"


def parse_file(datafile):
    name = ""
    data = []
    with open(datafile,'rb') as f:
        name = f.readline().split(",")[1][1:-1]
        reader = csv.reader(f)
        next(reader)
        for line in reader:
            data.append(line)
    # Do not change the line below
    return (name, data)


def test():
    datafile = os.path.join(DATADIR, DATAFILE)

    name, data = parse_file(datafile)
    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    assert data[0][1] == "01:00"
    assert data[2][0] == "01/01/2005"
    assert data[2][5] == "2"


if __name__ == "__main__":
    test()

## Excel Exercise

In [324]:
# -*- coding: utf-8 -*-
'''
Find the time and value of max load for each of the regions
COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST
and write the result out in a csv file, using pipe character | as the delimiter.

An example output can be seen in the "example.csv" file.
'''

import xlrd
import os
import csv
from zipfile import ZipFile

datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"


#def open_zip(datafile):
#    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
#        myzip.extractall()
#

def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    #data = [[sheet.cell_value(r, col) for col in range(sheet.ncols)] for r in range(sheet.nrows)]
    # YOUR CODE HERE
    # Remember that you can use xlrd.xldate_as_tuple(sometime, 0) to convert
    # Excel date to Python tuple of (year, month, day, hour, minute, second)
    result_list = []
    for item in range(1, len(['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                           'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']) + 1):
        region_name = sheet.col_values(item, start_rowx=0, end_rowx=1)[0].strip()
        region = sheet.col_values(item, start_rowx=1)
        maxvalue = max(region)
        ind_max = region.index(maxvalue) + 1
        max_time = sheet.cell_value(ind_max, 0)
        max_time_tuple = xlrd.xldate_as_tuple(max_time , 0)
        dict_results = {"Station": region_name, "Year": max_time_tuple[0], "Month": max_time_tuple[1], \
                       "Day": max_time_tuple[2], "Hour": max_time_tuple[3], "Max Load": maxvalue}
        result_list.append(dict_results)
    return result_list



def save_file(data, filename):
    with open(filename, 'wb') as f:  # Just use 'w' mode in 3.x
        writer = csv.DictWriter(f, data[0].keys(), delimiter="|")
        writer.writeheader()
        writer.writerows(data)

    
def test():
    #open_zip(datafile)
    data = parse_file(datafile)
    save_file(data, outfile)

    number_of_rows = 0
    stations = []

    ans = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                        'Year': '2013',
                        'Month': '6',
                        'Day': '26',
                        'Hour': '17'}}
    correct_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                        'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']

    with open(outfile) as of:
        csvfile = csv.DictReader(of, delimiter="|")
        for line in csvfile:
            station = line['Station']
            if station == 'FAR_WEST':
                for field in fields:
                    # Check if 'Max Load' is within .1 of answer
                    if field == 'Max Load':
                        max_answer = round(float(ans[station][field]), 1)
                        max_line = round(float(line[field]), 1)
                        assert max_answer == max_line

                    # Otherwise check for equality
                    else:
                        assert ans[station][field] == line[field]

            number_of_rows += 1
            stations.append(station)

        # Output should be 8 lines not including header
        assert number_of_rows == 8

        # Check Station Names
        assert set(stations) == set(correct_stations)

        
if __name__ == "__main__":
    test()


## Working with JSON files

In [326]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This exercise shows some important concepts that you should be aware about:
- using codecs module to write unicode files
- using authentication with web APIs
- using offset when accessing web APIs

To run this code locally you have to register at the NYTimes developer site 
and get your own API key. You will be able to complete this exercise in our UI
without doing so, as we have provided a sample result.

Your task is to process the saved file that represents the most popular
articles (by view count) from the last day, and return the following data:
- list of dictionaries, where the dictionary key is "section" and value is "title"
- list of URLs for all media entries with "format": "Standard Thumbnail"

All your changes should be in the article_overview function.
The rest of functions are provided for your convenience, if you want to access
the API by yourself.
"""
import json
import codecs
import requests

URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = { "popular": "6b1b8aa9f3914e24abc3f1bfe52aebbf",
            "article": "6b1b8aa9f3914e24abc3f1bfe52aebbf"}


def get_from_file(kind, period):
    filename = "popular-{0}-{1}.json".format(kind, period)
    with open(filename, "r") as f:
        return json.loads(f.read())


#def article_overview(kind, period):
#    data = get_from_file(kind, period)
#    titles = []
#    urls =[]
#    # YOUR CODE HERE
#
#    return (titles, urls)
#

def query_site(url, target, offset):
    # This will set up the query with the API key and offset
    # Web services often use offset paramter to return data in small chunks
    # NYTimes returns 20 articles per request, if you want the next 20
    # You have to provide the offset parameter
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        print "You need to register for NYTimes Developer account to run this program."
        print "See Intructor notes for information"
        return False
    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params = params)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def get_popular(url, kind, days, section="all-sections", offset=0):
    # This function will construct the query according to the requirements of the site
    # and return the data, or print an error message if called incorrectly
    if days not in [1,7,30]:
        print "Time period can be 1,7, 30 days only"
        return False
    if kind not in ["viewed", "shared", "emailed"]:
        print "kind can be only one of viewed/shared/emailed"
        return False

    url += "most{0}/{1}/{2}.json".format(kind, section, days)
    data = query_site(url, "popular", offset)

    return data


def save_file(kind, period):
    # This will process all results, by calling the API repeatedly with supplied offset value,
    # combine the data and then write all results in a file.
    data = get_popular(URL_POPULAR, "viewed", 1)
    num_results = data["num_results"]
    full_data = []
    with codecs.open("popular-{0}-{1}.json".format(kind, period), encoding='utf-8', mode='w') as v:
        for offset in range(0, num_results, 20):        
            data = get_popular(URL_POPULAR, kind, period, offset=offset)
            full_data += data["results"]
        
        v.write(json.dumps(full_data, indent=2))


#def test():
#    titles, urls = article_overview("viewed", 1)
#    assert len(titles) == 20
#    assert len(urls) == 30
#    assert titles[2] == {'Opinion': 'Professors, We Need You!'}
#    assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'
#
#
#if __name__ == "__main__":
#    test()

In [334]:
save_file("viewed", 1)

In [335]:
data = get_from_file("viewed", 1)

In [379]:
data[0]["media"][0]


{u'caption': u'Matt Lauer with Hillary Clinton during Wednesday\u2019s forum at the Intrepid Sea, Air &amp; Space Museum in Manhattan.',
 u'copyright': u'Doug Mills/The New York Times',
 u'media-metadata': [{u'format': u'square320',
   u'height': 320,
   u'url': u'https://static01.nyt.com/images/2016/09/08/us/08ASSESSweb1/08ASSESSweb1-square320-v2.jpg',
   u'width': 320},
  {u'format': u'Standard Thumbnail',
   u'height': 75,
   u'url': u'https://static01.nyt.com/images/2016/09/08/us/08ASSESSweb1/08ASSESSweb1-thumbStandard-v2.jpg',
   u'width': 75},
  {u'format': u'Normal',
   u'height': 133,
   u'url': u'https://static01.nyt.com/images/2016/09/08/us/08ASSESSweb1/08ASSESSweb1-articleInline.jpg',
   u'width': 190},
  {u'format': u'Large',
   u'height': 264,
   u'url': u'https://static01.nyt.com/images/2016/09/08/us/08ASSESSweb1/08ASSESSweb1-sfSpan-v2.jpg',
   u'width': 395},
  {u'format': u'Jumbo',
   u'height': 715,
   u'url': u'https://static01.nyt.com/images/2016/09/08/us/08ASSESSweb

In [382]:
for num, article in enumerate(data):
    for  entry in article["media"][0]["media-metadata"]:
        if entry["format"] == 'Standard Thumbnail':
            print "yes"
            print num

yes
0
yes
1
yes
2
yes
3
yes
4
yes
5
yes
6
yes
7
yes
8
yes
9
yes
10
yes
11
yes
12
yes
13
yes
14
yes
15
yes
16
yes
17
yes
18
yes
19
yes
20
yes
21
yes
22
yes
23
yes
24
yes
25
yes
26
yes
27
yes
28
yes
29
yes
30
yes
31
yes
32
yes
33
yes
34
yes
35
yes
36
yes
37


IndexError: string index out of range

In [340]:
section_title = []
for article in data:
    dict_data = {article["section"]: article["title"]}
    section_title.append(dict_data)

In [341]:
print section_title



In [384]:
data[38]["media"][0]["media-metadata"]

IndexError: string index out of range

In [385]:
data[38]

{u'abstract': u'In a decision that could fundamentally reshape public education in the state, a judge on Wednesday ordered changes in everything from how schools are financed to how teachers are paid and evaluated.',
 u'adx_keywords': u'Connecticut;Decisions and Verdicts;Education (K-12);Moukawsher, Thomas G (1962- );Teachers and School Employees;Suits and Litigation (Civil);Budgets and Budgeting',
 u'asset_id': 100000004632450,
 u'byline': u'By ELIZABETH A. HARRIS',
 u'column': u'',
 u'des_facet': [u'DECISIONS AND VERDICTS',
  u'EDUCATION (K-12)',
  u'BUDGETS AND BUDGETING'],
 u'geo_facet': [u'CONNECTICUT'],
 u'id': 100000004632450,
 u'media': u'',
 u'org_facet': u'',
 u'per_facet': [u'MOUKAWSHER, THOMAS G (1962- )'],
 u'published_date': u'2016-09-08',
 u'section': u'N.Y. / Region',
 u'source': u'The New York Times',
 u'title': u'Judge, Citing Inequality, Orders Connecticut to Overhaul Its School System',
 u'type': u'Article',
 u'url': u'http://www.nytimes.com/2016/09/08/nyregion/conn

In [388]:
round(0.2, 0)

0.0