# Exercises - Data Acquisition

All of the exercises for this module should be done within your `ds-methodologies` repository, inside of a directory named `time_series`.

The end result of this exercise should be a file named `acquire.py`.

In [1]:
from pprint import pprint
import requests
import pandas as pd
import json

In [2]:
from debug import local_settings, timeifdebug, timeargsifdebug, frame_splain

import acquire as acq

In [None]:
# NOTE(review): incomplete attribute access in a cell that was never executed
# (looks like an abandoned tab-completion) — confirm the intended attribute of
# `local_settings` or remove this cell.
local_settings.deb

1. **Using the code from the lesson as a guide, create a dataframe named `items` that has all of the data for items.**

In [3]:
# Root of the API host; endpoint paths such as '/api/v1/items' are appended to it.
base_url = 'https://python.zach.lol'

In [4]:
# Per-section API settings, keyed by section name. Every entry carries the key
# that holds the records inside the payload ('sect_key'), the endpoint path
# ('api_url'), the column used as the dataframe index ('idx_col'), the first
# page number ('page_beg') and the CSV file name ('csv_name').
sect_urls = {
    section: {
        'sect_key': section,
        'api_url': '/api/v1/' + section,
        'idx_col': index_column,
        'page_beg': 1,
        'csv_name': section + '.csv',
    }
    for section, index_column in (
        ('items', 'item_id'),
        ('stores', 'store_id'),
        ('sales', 'sale_id'),
    )
}

In [5]:
def get_val_from_key(dict, key='key', keys=None):
    """Return ``dict[key]`` when ``key`` is listed in ``keys``, else ``None``.

    Args:
        dict: Mapping to read from. (The name shadows the builtin but is kept
            because callers pass it by keyword.)
        key: Key to look up.
        keys: Collection of keys to test membership against. When ``None``,
            the mapping's own keys are used.

    Returns:
        The stored value, or ``None`` when ``key`` is not in ``keys``.

    Raises:
        KeyError: If a caller-supplied ``keys`` contains ``key`` but the
            mapping itself does not.
    """
    known_keys = dict.keys() if keys is None else keys
    if key not in known_keys:
        return None
    return dict[key]

In [6]:
def get_json_payload_data(
    target_table='items',
    base_url='https://python.zach.lol',
    sect_key='items',
    api_url='/api/v1/items',
    idx_col='item_id',
    beg_page=1,
    csv_name='items.csv',
    data_key='payload',
    status_key='status',
    on_page_key='page',
    of_pages_key='max_page',
    url_next_key='next_page',
    url_prev_key='previous_page',
    url_get_page='?page=',
    sep=',',
    to_csv=False,
    show_log=False,
    ):
    """Download every page of a paginated JSON endpoint into one DataFrame.

    Starting at ``base_url + api_url``, each response is expected to be a JSON
    object whose ``data_key`` entry (the payload) holds the records under
    ``sect_key`` and the relative URL of the following page under
    ``url_next_key``. Pages are followed until the payload reports no next
    page.

    Fetching stops early, keeping whatever was collected so far, when a
    response has no payload, the payload has no ``sect_key`` entry, or the
    records lack the ``idx_col`` column (in which case 'index is missing' is
    printed).

    Args:
        target_table: Accepted for interface compatibility; not used here.
        base_url: Scheme and host that every page URL is appended to.
        sect_key: Payload key under which the page's records are stored.
        api_url: Path of the first page, relative to ``base_url``.
        idx_col: Record column that becomes the DataFrame index.
        beg_page: Accepted for interface compatibility; not used here.
        csv_name: Destination path used when ``to_csv`` is true.
        data_key: Top-level response key that holds the payload.
        status_key: Accepted for interface compatibility; not used here.
        on_page_key: Accepted for interface compatibility; not used here.
        of_pages_key: Accepted for interface compatibility; not used here.
        url_next_key: Payload key that holds the next page's relative URL.
        url_prev_key: Accepted for interface compatibility; not used here.
        url_get_page: Accepted for interface compatibility; not used here.
        sep: Field delimiter for the CSV output.
        to_csv: When true, write the combined data to ``csv_name``.
        show_log: When true, print each URL before it is requested.

    Returns:
        A DataFrame indexed by ``idx_col`` containing the rows of every page
        fetched, or an empty DataFrame when no page could be used.

    Raises:
        ValueError: If the same index value occurs on more than one page.
    """
    # Pages are collected in a list and combined once at the end:
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied all previously fetched rows for every new page.
    page_frames = []
    next_url = api_url

    while True:
        get_url = base_url + next_url

        if show_log:
            print('Fetching page', get_url)
        response_json = requests.get(get_url).json()

        payload = response_json.get(data_key)
        if payload is None:
            break

        target_data = payload.get(sect_key)
        if target_data is None:
            break

        page_df = pd.DataFrame(target_data)
        if idx_col not in page_df.columns:
            print('index is missing')
            break
        page_frames.append(page_df.set_index(idx_col))

        # A missing or null next-page link marks the last page.
        next_url = payload.get(url_next_key)
        if next_url is None:
            break

    if page_frames:
        # verify_integrity keeps the duplicate-index check that the per-page
        # append performed; it now runs once over all pages.
        df = pd.concat(page_frames, verify_integrity=True)
    else:
        # pd.concat rejects an empty list, so return the empty frame directly.
        df = pd.DataFrame()

    if to_csv:
        df.to_csv(
            path_or_buf=csv_name,
            sep=sep,
            index=True,
            index_label=idx_col,
        )

    return df
    

In [7]:
def output_payload_data(
    target_table='items', 
    base_url='https://python.zach.lol',
    to_csv=False,
    show_log=False,
    sect_url_keys=sect_urls,
    sep=',',
    **kwargs):
    """Fetch one API section by name using its entry in ``sect_url_keys``.

    Looks up the settings stored under ``target_table`` and forwards them,
    together with the fixed payload/pagination key names, to
    ``get_json_payload_data``.

    Args:
        target_table: Key of the section to fetch within ``sect_url_keys``.
        base_url: Scheme and host of the API.
        to_csv: Forwarded; when true the result is also written to the
            section's ``csv_name``.
        show_log: Forwarded; when true each requested URL is printed.
        sect_url_keys: Mapping of section name to a settings dict with the
            keys 'sect_key', 'api_url', 'idx_col', 'page_beg' and 'csv_name'.
        sep: Field delimiter forwarded for the CSV output.
        **kwargs: Accepted and ignored.

    Returns:
        Whatever ``get_json_payload_data`` returns for the section.

    Raises:
        KeyError: If ``target_table`` or one of the settings keys is missing.
    """
    section = sect_url_keys[target_table]

    return get_json_payload_data(
        target_table=target_table,
        base_url=base_url,
        sect_key=section['sect_key'],
        api_url=section['api_url'],
        idx_col=section['idx_col'],
        beg_page=section['page_beg'],
        csv_name=section['csv_name'],
        data_key='payload',
        status_key='status',
        on_page_key='page',
        of_pages_key='max_page',
        url_next_key='next_page',
        url_prev_key='previous_page',
        url_get_page='?page=',
        sep=sep,
        to_csv=to_csv,
        show_log=show_log,
    )


In [8]:
# Acquire the `items` section through the `acquire` module, requesting CSV
# output and per-page logging.
# NOTE(review): `debug` is not a named parameter of the notebook's own
# output_payload_data (there it would be swallowed by **kwargs) — confirm that
# acq.output_payload_data actually uses it.
items_df = acq.output_payload_data(
    target_table='items', 
    base_url=base_url, 
    to_csv=True,
    show_log=True,
    sect_url_keys=sect_urls,
    sep=',',
    debug=True
)

Fetching page https://python.zach.lol/api/v1/items
Fetching page https://python.zach.lol/api/v1/items?page=2
Fetching page https://python.zach.lol/api/v1/items?page=3


In [9]:
# Acquire the `stores` section the same way (exercise 2); the logged output
# below shows a single page being requested.
stores_df = acq.output_payload_data(
    target_table='stores', 
    base_url=base_url, 
    to_csv=True,
    show_log=True,
    sect_url_keys=sect_urls,
    sep=',',
    debug=True
)

Fetching page https://python.zach.lol/api/v1/stores


In [10]:
# Acquire the `sales` section (exercise 3). The recorded run below was
# interrupted (KeyboardInterrupt) after page 4, so `sales_df` was not assigned
# by that run.
sales_df = acq.output_payload_data(
    target_table='sales', 
    base_url=base_url, 
    to_csv=True,
    show_log=True,
    sect_url_keys=sect_urls,
    sep=',',
    debug=True
)

Fetching page https://python.zach.lol/api/v1/sales
Fetching page https://python.zach.lol/api/v1/sales?page=2
Fetching page https://python.zach.lol/api/v1/sales?page=3
Fetching page https://python.zach.lol/api/v1/sales?page=4


KeyboardInterrupt: 

In [None]:
# Preview the first rows of the sales data.
sales_df.head()

In [None]:
# Count how many sales rows reference each distinct `item` value.
sales_df.item.value_counts()

In [None]:
# Count how many sales rows fall on each distinct `sale_date` value.
sales_df.sale_date.value_counts()

In [None]:
# Count how many sales rows reference each distinct `store` value.
sales_df.store.value_counts()

2. **Do the same thing, but for `stores`.**

3. **Extract the data for `sales`. There are a lot of pages of data here, so your code will need to be a little more complex. Your code should continue fetching data from the next page until all of the data is extracted.**

4. **Save the data in your files to local csv files so that it will be faster to access in the future.**

5. **Combine the data from your three separate dataframes into one large dataframe.**

6. **Acquire the Open Power Systems Data for Germany, which has been rapidly expanding its renewable energy production in recent years. The data set includes country-wide totals of electricity consumption, wind power production, and solar power production for 2006-2017. You can get the data here:** https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv

7. **Make sure all the work that you have done above is reproducible. That is, you should put the code above into separate functions in the `acquire.py` file and be able to re-run the functions and get the same data.**