# Wrangling JSON

## An example showing how to get data from the New York Times (NYT) using Python.
First, an API key from the NYT developer site (https://developer.nytimes.com) is required to access the data.

In [1]:
# Import the required libs
import json
import requests
from pprint import pprint

# Search terms for the Article Search API.
query = 'new york times'
query2 = 'Presidential'

# An API key requested from the NYT developer site is needed for every call.
# There are many kinds of APIs available; this cell uses Article Search.
# NOTE(review): the key is hard-coded below; consider loading it from the
# environment so it is not shared along with the notebook.
url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'

# Pass the query-string fields separately so that ``query`` is really
# substituted and URL-encoded (spaces included). Previously the text
# " + query + " was part of the string literal, so the search term was
# never sent.
params = {
    'q': query,
    'page': 2,
    'sort': 'oldest',
    'api-key': '7b6be480b8ef4f6f91996a4af09491ca',
}

# Store the response in a variable called resp; resp.text holds the body
# as one continuous string.
resp = requests.get(url, params=params)

# Parse that string into Python objects using the loads function.
data = json.loads(resp.text)

Extract all of the data at once, or pull out individual pieces one by one using specific keys.

In [2]:
# Every snippet below is switched off; change its guard from False to True
# to run it.

# Pretty-print the whole payload for a more readable view.
if False:
    pprint(data)

# List the top-level keys of the payload.
if False:
    for top_key in data:
        print(top_key)

# Look at the value stored under one of those top-level keys, e.g. 'response'.
if False:
    response = data['response']
    pprint(response)

# List the keys nested inside 'response'.
if False:
    for response_key in data['response']:
        pprint(response_key)

# Show everything stored under 'docs'.
if False:
    docs = data['response']['docs']
    pprint(docs)

# Show only the first element of the docs array.
if False:
    first_doc = data['response']['docs'][0]
    pprint(first_doc)

# Print the main headline of every article.
if False:
    for doc in data['response']['docs']:
        headline = doc['headline']
        pprint(headline['main'])

More data wrangling

In [3]:
# Endpoint for the most-viewed arts articles, split across two literals
# purely for readability; the resulting URL string is unchanged.
url2 = (
    'http://api.nytimes.com/svc/mostpopular/v2/mostviewed/arts/30.json'
    '?offset=40&api-key=7b6be480b8ef4f6f91996a4af09491ca'
)

# Fetch the response, then parse its text body as JSON.
resp2 = requests.get(url2)
data2 = json.loads(resp2.text)

# Make a function that pulls the titles and thumbnail URLs out of the data.
def article_overview(payload=None):
    """Collect section/title pairs and thumbnail URLs from a response.

    Args:
        payload: Parsed response dict holding a ``'results'`` list of
            article dicts. Defaults to the module-level ``data2`` so that
            existing zero-argument calls keep working.

    Returns:
        A ``(titles, urls)`` tuple. ``titles`` contains one single-entry
        dict ``{section: title}`` per article, in response order. ``urls``
        is a flat list of the ``'url'`` of every media-metadata entry whose
        ``'format'`` is ``'Standard Thumbnail'``.

    Raises:
        KeyError: If ``payload`` has no ``'results'`` key, or an article
            lacks ``'section'`` or ``'title'``.
    """
    if payload is None:
        payload = data2

    titles = []
    urls = []
    # Each element of 'results' is one article dict.
    for article in payload['results']:
        # One dict per article, with section as key and title as value.
        titles.append({article['section']: article['title']})
        # 'media' is optional; ``or []`` also covers an empty or None value
        # so that such articles simply contribute no URLs.
        for media in article.get('media') or []:
            # Each media dict carries a 'media-metadata' list of renditions.
            for meta in media.get('media-metadata') or []:
                if meta.get('format') == 'Standard Thumbnail':
                    urls.append(meta['url'])
    return (titles, urls)

# Each guard below is a manual switch: change False to True to run it.

# Dump the whole most-viewed payload.
if False:
    pprint(data2)

# Show the payload's top-level keys, one per line.
if False:
    for top_level_key in data2:
        print(top_level_key)

# Build the overview with article_overview and pretty-print the result.
if False:
    overview = article_overview()
    pprint(overview)