In [13]:
import numpy as np
import pandas as pd
# The requests library simplifies the process of making http requests
import requests
import os

# Exercise Instructions 

The end result of this exercise should be a file named acquire.py.

1. Using the code from the lesson as a guide and the REST API from https://python.zgulde.net/api/v1/items as we did in the lesson, create a dataframe named items that has all of the data for items.

2. Do the same thing, but for stores (https://python.zgulde.net/api/v1/stores)

3. Extract the data for sales (https://python.zgulde.net/api/v1/sales). There are a lot of pages of data here, so your code will need to be a little more complex. Your code should continue fetching data from the next page until all of the data is extracted.

4. Save the data in your dataframes to local csv files so that it will be faster to access in the future.

5. Combine the data from your three separate dataframes into one large dataframe.

6. Acquire the Open Power Systems Data for Germany, which has been rapidly expanding its renewable energy production in recent years. The data set includes country-wide totals of electricity consumption, wind power production, and solar power production for 2006-2017. You can get the data here: https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv

7. Make sure all the work that you have done above is reproducible. That is, you should put the code above into separate functions in the acquire.py file and be able to re-run the functions and get the same data.

## Exercise 1

In [45]:
## Initial set-up
domain = 'https://python.zgulde.net'
endpoint = '/api/v1/items'
items = []

# Request the first page and collect the item records it contains
url = f'{domain}{endpoint}'
response = requests.get(url)
data = response.json()
items += data['payload']['items']
# the payload carries the path of the following page
endpoint = data['payload']['next_page']

In [46]:
# Inspect which fields the API response carries inside 'payload'
data['payload'].keys()

dict_keys(['items', 'max_page', 'next_page', 'page', 'previous_page'])

In [47]:
# Fetch the next page of items, if there is one.
# Guarding on None keeps this cell safe to re-run after the last page:
# without the check, domain + endpoint raises a TypeError once the API
# has returned None as its next_page.
if endpoint is not None:
    url = domain + endpoint
    response = requests.get(url)
    data = response.json()
    items.extend(data['payload']['items'])
    # update the endpoint; None marks the end of the data
    endpoint = data['payload']['next_page']

In [48]:
# Fetch the next page of items, if there is one.
# Guarding on None keeps this cell safe to re-run after the last page:
# without the check, domain + endpoint raises a TypeError once the API
# has returned None as its next_page.
if endpoint is not None:
    url = domain + endpoint
    response = requests.get(url)
    data = response.json()
    items.extend(data['payload']['items'])
    # update the endpoint; None marks the end of the data
    endpoint = data['payload']['next_page']

In [49]:
# endpoint was last assigned from the response's 'next_page' field
type(endpoint)

NoneType

In [50]:
# The 'max_page' value reported in the most recent response
data['payload']['max_page']

3

In [7]:
# Build a dataframe from the item records collected so far and display it
df = pd.DataFrame(items)
df

Unnamed: 0,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,Caress,2,Caress Velvet Bliss Ultra Silkening Beauty Bar...,6.44,11111065925,11111065925
2,Earths Best,3,Earths Best Organic Fruit Yogurt Smoothie Mixe...,2.43,23923330139,23923330139
3,Boars Head,4,Boars Head Sliced White American Cheese - 120 Ct,3.14,208528800007,208528800007
4,Back To Nature,5,Back To Nature Gluten Free White Cheddar Rice ...,2.61,759283100036,759283100036
5,Sally Hansen,6,Sally Hansen Nail Color Magnetic 903 Silver El...,6.93,74170388732,74170388732
6,Twinings Of London,7,Twinings Of London Classics Lady Grey Tea - 20 Ct,9.64,70177154004,70177154004
7,Lea & Perrins,8,Lea & Perrins Marinade In-a-bag Cracked Pepper...,1.68,51600080015,51600080015
8,Van De Kamps,9,Van De Kamps Fillets Beer Battered - 10 Ct,1.79,19600923015,19600923015
9,Ahold,10,Ahold Cocoa Almonds,3.17,688267141676,688267141676


In [66]:
# Create a function that replicates this

def get_items(use_cache = True):
    '''
    Return the items data as a dataframe, using a local csv cache when possible.

    If use_cache is True and 'items.csv' exists in the working directory, the
    dataframe is read from that file. Otherwise every page of the items API is
    requested in turn, following each response's 'next_page' value until it is
    None. The combined records are then cached as 'items.csv' and returned.

    Parameters
    ----------
    use_cache : bool, default True
        When False, the csv is ignored and the data is fetched again from the
        API (the csv is overwritten with the fresh data).

    Returns
    -------
    pandas.DataFrame
        One row per item record.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status.
    '''
    filename = 'items.csv'

    # Check for the csv cache
    if use_cache and os.path.isfile(filename):
        print('Using cached csv...')
        return pd.read_csv(filename)

    print('Gathering data using an API...')
    domain = 'https://python.zgulde.net'
    endpoint = '/api/v1/items'
    items = []

    # Follow the 'next_page' links instead of counting pages: each page is
    # requested exactly once and the last page is never skipped.
    while endpoint is not None:
        response = requests.get(domain + endpoint)
        # Fail loudly on an HTTP error rather than parsing an error body
        response.raise_for_status()
        payload = response.json()['payload']
        # list.extend mutates in place and returns None, so its result must
        # not be assigned back to items
        items.extend(payload['items'])
        endpoint = payload['next_page']

    # Now cache the dataframe as a .csv
    df = pd.DataFrame(items)
    df.to_csv(filename, index = False)

    return df



In [67]:
# Test the function with its default use_cache=True
get_items()

Using cached csv...


Unnamed: 0,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,Caress,2,Caress Velvet Bliss Ultra Silkening Beauty Bar...,6.44,11111065925,11111065925
2,Earths Best,3,Earths Best Organic Fruit Yogurt Smoothie Mixe...,2.43,23923330139,23923330139
3,Boars Head,4,Boars Head Sliced White American Cheese - 120 Ct,3.14,208528800007,208528800007
4,Back To Nature,5,Back To Nature Gluten Free White Cheddar Rice ...,2.61,759283100036,759283100036
5,Sally Hansen,6,Sally Hansen Nail Color Magnetic 903 Silver El...,6.93,74170388732,74170388732
6,Twinings Of London,7,Twinings Of London Classics Lady Grey Tea - 20 Ct,9.64,70177154004,70177154004
7,Lea & Perrins,8,Lea & Perrins Marinade In-a-bag Cracked Pepper...,1.68,51600080015,51600080015
8,Van De Kamps,9,Van De Kamps Fillets Beer Battered - 10 Ct,1.79,19600923015,19600923015
9,Ahold,10,Ahold Cocoa Almonds,3.17,688267141676,688267141676


## Exercise 2

In [10]:
domain = 'https://python.zgulde.net'
endpoint = '/api/v1/stores'

# The accumulator is created once, before the loop; resetting it on every
# pass would throw away the records from the earlier pages.
stores = []

# Keep requesting pages until the API reports no next page
while endpoint is not None:
    url = domain + endpoint
    data = requests.get(url).json()
    # collect into stores (not items) so the two datasets stay separate
    stores.extend(data['payload']['stores'])
    endpoint = data['payload']['next_page']

# Build the dataframe once, after every page has been collected
df2 = pd.DataFrame(stores)


In [12]:
# Preview the first rows of df2
df2.head()