# Exercises - Data Acquisition

All of the exercises for this module should be done within your `ds-methodologies` repository, inside of a directory named `time_series`.

The end result of this exercise should be a file named `acquire.py`.

In [1]:
from pprint import pprint
import requests
import pandas as pd
import json

from os import path

In [2]:
from debug import local_settings, timeifdebug, timeargsifdebug, frame_splain

import acquire as acq


In [3]:
# Switch on the debug flag of the project-local `debug` helper module.
# NOTE(review): presumably this is what makes the timing decorators print the
# "starting ..." / "ending ..." lines seen in the outputs further down — confirm.
local_settings.debug = True

1. **Using the code from the lesson as a guide, create a dataframe named `items` that has all of the data for items.**

2. **Do the same thing, but for `stores`.**

3. **Extract the data for `sales`. There are a lot of pages of data here, so your code will need to be a little more complex. Your code should continue fetching data from the next page until all of the data is extracted.**

4. **Save the data in your files to local csv files so that it will be faster to access in the future.**

5. **Combine the data from your three separate dataframes into one large dataframe.**

In [4]:
# Root URL of the API; passed as `base_url` to the acquisition calls below.
base_url = 'https://python.zach.lol'

In [5]:
# Per-table API settings, keyed by table name. Each entry holds:
#   sect_key - key inside the JSON payload that contains the table's rows
#   api_url  - endpoint path that is appended to the base url
#   idx_col  - column used as the dataframe index
#   page_beg - first page number
#   csv_name - file the table is written to when saving to csv
# All tables share the same layout, so the entries are generated from
# (table name, index column) pairs instead of being spelled out by hand.
sect_urls = {
    table: {
        'sect_key': table,
        'api_url': '/api/v1/' + table,
        'idx_col': index_column,
        'page_beg': 1,
        'csv_name': table + '.csv',
    }
    for table, index_column in (
        ('items', 'item_id'),
        ('stores', 'store_id'),
        ('sales', 'sale_id'),
    )
}

In [6]:
def get_val_from_key(dict, key='key', keys=None):
    """Look up ``key`` in ``dict``, returning ``None`` when it is not listed.

    Parameters
    ----------
    dict : mapping
        Mapping to read from.
    key : hashable, default 'key'
        Key to look up.
    keys : container, optional
        Collection of keys treated as present. Defaults to ``dict.keys()``.

    Returns
    -------
    The value stored under ``key`` when ``key`` is in ``keys``, else ``None``.
    """
    known_keys = dict.keys() if keys is None else keys
    if key not in known_keys:
        return None
    return dict[key]

In [7]:
def get_json_payload_data(
    target_table='items',
    base_url='https://python.zach.lol',
    sect_key='items',
    api_url='/api/v1/items',
    idx_col='item_id',
    beg_page=1,
    csv_name='items.csv',
    data_key='payload',
    status_key='status',
    on_page_key='page',
    of_pages_key='max_page',
    url_next_key='next_page',
    url_prev_key='previous_page',
    url_get_page='?page=',
    sep=',',
    to_csv=False,
    show_log=False,
    ):
    """Download every page of a paginated JSON endpoint into one dataframe.

    Starting at ``base_url + api_url``, each response is expected to be a JSON
    object whose ``data_key`` entry is a mapping holding the rows under
    ``sect_key`` and the relative url of the following page under
    ``url_next_key``. Pages are followed until that next-page value is missing
    or ``None``.

    Parameters
    ----------
    base_url : str
        Scheme and host; the page urls are appended to it.
    sect_key : str
        Key inside the payload that holds the rows for the table.
    api_url : str
        Path of the first page.
    idx_col : str
        Column of each page that becomes the dataframe index.
    csv_name : str
        Destination file, used only when ``to_csv`` is true.
    data_key : str
        Key of the payload inside the response JSON.
    url_next_key : str
        Key inside the payload that holds the next page's relative url.
    sep : str
        Field delimiter used when writing the csv.
    to_csv : bool
        When true, write the combined dataframe to ``csv_name``.
    show_log : bool
        When true, print each url before it is fetched.
    target_table, beg_page, status_key, on_page_key, of_pages_key,
    url_prev_key, url_get_page
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        All fetched rows indexed by ``idx_col``. Empty when no page could be
        read.

    Raises
    ------
    requests.HTTPError
        When a page responds with an HTTP error status.
    ValueError
        When the same index value appears more than once across pages.

    Notes
    -----
    Fetching stops early (keeping the pages read so far) when a response has
    no payload, the payload has no ``sect_key`` entry, or a page lacks the
    ``idx_col`` column.
    """
    # Collect the pages and concatenate once at the end: DataFrame.append was
    # removed in pandas 2.0 and re-copying the frame per page is quadratic.
    page_frames = []

    next_url = api_url
    while next_url is not None:
        get_url = base_url + next_url

        if show_log:
            print('Fetching page', get_url)
        response = requests.get(get_url)
        # Fail loudly on an HTTP error instead of ending up with a truncated
        # dataframe (and csv) that looks complete.
        response.raise_for_status()

        payload = get_val_from_key(dict=response.json(), key=data_key)
        if payload is None:
            break

        target_data = get_val_from_key(dict=payload, key=sect_key)
        if target_data is None:
            break

        page_df = pd.DataFrame(target_data)
        if idx_col not in page_df.columns:
            print('index is missing')
            break
        page_frames.append(page_df.set_index(idx_col))

        # A missing or None next-page url marks the last page.
        next_url = get_val_from_key(dict=payload, key=url_next_key)

    if page_frames:
        # verify_integrity rejects index values repeated across pages.
        df = pd.concat(page_frames, verify_integrity=True)
    else:
        df = pd.DataFrame()

    if to_csv:
        df.to_csv(
            path_or_buf=csv_name,
            sep=sep,
            index=True,
            index_label=idx_col,
        )

    return df
    

In [8]:
def output_payload_data(
    target_table='items', 
    base_url='https://python.zach.lol',
    to_csv=False,
    show_log=False,
    sect_url_keys=sect_urls,
    sep=',',
    **kwargs):
    """Fetch one table from the API using its entry in ``sect_url_keys``.

    Parameters
    ----------
    target_table : str
        Key of the table's settings entry in ``sect_url_keys``.
    base_url : str
        Scheme and host of the API.
    to_csv : bool
        Forwarded to ``get_json_payload_data``; write the result to the
        entry's ``csv_name`` when true.
    show_log : bool
        Forwarded to ``get_json_payload_data``; print each fetched url.
    sect_url_keys : dict
        Mapping of table name to a settings dict with the keys ``sect_key``,
        ``api_url``, ``idx_col``, ``page_beg`` and ``csv_name``.
    sep : str
        Field delimiter for the csv.
    **kwargs
        Accepted but not used in this function body.

    Returns
    -------
    The dataframe returned by ``get_json_payload_data``.

    Raises
    ------
    KeyError
        When ``target_table`` or one of the expected settings keys is absent.
    """
    table_settings = sect_url_keys[target_table]

    # Per-table values come from the settings entry; the JSON key names are
    # fixed to the ones this API uses.
    fetch_options = {
        'target_table': target_table,
        'base_url': base_url,
        'sect_key': table_settings['sect_key'],
        'api_url': table_settings['api_url'],
        'idx_col': table_settings['idx_col'],
        'beg_page': table_settings['page_beg'],
        'csv_name': table_settings['csv_name'],
        'data_key': 'payload',
        'status_key': 'status',
        'on_page_key': 'page',
        'of_pages_key': 'max_page',
        'url_next_key': 'next_page',
        'url_prev_key': 'previous_page',
        'url_get_page': '?page=',
        'sep': sep,
        'to_csv': to_csv,
        'show_log': show_log,
    }
    return get_json_payload_data(**fetch_options)


In [9]:
# Exercise 1: fetch the `items` table through the `acquire` module and, because
# to_csv=True, save it to the csv named in sect_urls['items'].
# NOTE(review): assumes acq.output_payload_data matches the definition shown in
# this notebook; `debug=True` is presumably consumed by the timing decorators
# imported from `debug` — confirm.
items_df = acq.output_payload_data(
    target_table='items', 
    base_url=base_url, 
    to_csv=True,
    show_log=True,
    sect_url_keys=sect_urls,
    sep=',',
    debug=True
)

2019-11-13 22:22:54 starting output_payload_data
2019-11-13 22:22:54 starting get_json_payload_data
Fetching page https://python.zach.lol/api/v1/items
Fetching page https://python.zach.lol/api/v1/items?page=2
Fetching page https://python.zach.lol/api/v1/items?page=3
2019-11-13 22:22:55 ending get_json_payload_data ; time: 0:00:00.966151
2019-11-13 22:22:55 ending output_payload_data ; time: 0:00:00.966734


In [10]:
# Exercise 2: fetch the `stores` table through the `acquire` module and, because
# to_csv=True, save it to the csv named in sect_urls['stores'].
# NOTE(review): assumes acq.output_payload_data matches the definition shown in
# this notebook; `debug=True` is presumably consumed by the timing decorators
# imported from `debug` — confirm.
stores_df = acq.output_payload_data(
    target_table='stores', 
    base_url=base_url, 
    to_csv=True,
    show_log=True,
    sect_url_keys=sect_urls,
    sep=',',
    debug=True
)

2019-11-13 22:22:55 starting output_payload_data
2019-11-13 22:22:55 starting get_json_payload_data
Fetching page https://python.zach.lol/api/v1/stores
2019-11-13 22:22:55 ending get_json_payload_data ; time: 0:00:00.306791
2019-11-13 22:22:55 ending output_payload_data ; time: 0:00:00.311423


In [11]:
# Exercises 3 and 4: the sales endpoint spans well over a hundred pages, so reuse
# the csv written by a previous run when it exists instead of downloading
# everything again. Delete the csv to force a fresh download.
if path.exists(sect_urls['sales']['csv_name']):
    sales_df = pd.read_csv(
        sect_urls['sales']['csv_name'],
        sep=',',
        index_col=sect_urls['sales']['idx_col'],
    )
else:
    sales_df = acq.output_payload_data(
        target_table='sales', 
        base_url=base_url, 
        to_csv=True,
        show_log=True,
        sect_url_keys=sect_urls,
        sep=',',
        debug=True
    )

2019-11-13 22:22:55 starting output_payload_data
2019-11-13 22:22:55 starting get_json_payload_data
Fetching page https://python.zach.lol/api/v1/sales
Fetching page https://python.zach.lol/api/v1/sales?page=2
Fetching page https://python.zach.lol/api/v1/sales?page=3
Fetching page https://python.zach.lol/api/v1/sales?page=4
Fetching page https://python.zach.lol/api/v1/sales?page=5
Fetching page https://python.zach.lol/api/v1/sales?page=6
Fetching page https://python.zach.lol/api/v1/sales?page=7
Fetching page https://python.zach.lol/api/v1/sales?page=8
Fetching page https://python.zach.lol/api/v1/sales?page=9
Fetching page https://python.zach.lol/api/v1/sales?page=10
Fetching page https://python.zach.lol/api/v1/sales?page=11
Fetching page https://python.zach.lol/api/v1/sales?page=12
Fetching page https://python.zach.lol/api/v1/sales?page=13
Fetching page https://python.zach.lol/api/v1/sales?page=14
Fetching page https://python.zach.lol/api/v1/sales?page=15
Fetching page https://python.za

Fetching page https://python.zach.lol/api/v1/sales?page=138
Fetching page https://python.zach.lol/api/v1/sales?page=139
Fetching page https://python.zach.lol/api/v1/sales?page=140
Fetching page https://python.zach.lol/api/v1/sales?page=141
Fetching page https://python.zach.lol/api/v1/sales?page=142
Fetching page https://python.zach.lol/api/v1/sales?page=143
Fetching page https://python.zach.lol/api/v1/sales?page=144
Fetching page https://python.zach.lol/api/v1/sales?page=145
Fetching page https://python.zach.lol/api/v1/sales?page=146
Fetching page https://python.zach.lol/api/v1/sales?page=147
Fetching page https://python.zach.lol/api/v1/sales?page=148
Fetching page https://python.zach.lol/api/v1/sales?page=149
Fetching page https://python.zach.lol/api/v1/sales?page=150
Fetching page https://python.zach.lol/api/v1/sales?page=151
Fetching page https://python.zach.lol/api/v1/sales?page=152
Fetching page https://python.zach.lol/api/v1/sales?page=153
Fetching page https://python.zach.lol/ap

In [12]:
# Preview the first five sales rows to check the index and the columns.
sales_df.head(n=5)

Unnamed: 0_level_0,item,sale_amount,sale_date,store
sale_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1
2,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",1
3,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",1
4,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",1
5,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",1


In [13]:
# Number of sales rows per item.
sales_df['item'].value_counts()

50    18260
13    18260
23    18260
22    18260
21    18260
20    18260
19    18260
18    18260
17    18260
16    18260
15    18260
14    18260
12    18260
49    18260
11    18260
10    18260
9     18260
8     18260
7     18260
6     18260
5     18260
4     18260
3     18260
2     18260
24    18260
25    18260
26    18260
27    18260
48    18260
47    18260
46    18260
45    18260
44    18260
43    18260
42    18260
41    18260
40    18260
39    18260
38    18260
37    18260
36    18260
35    18260
34    18260
33    18260
32    18260
31    18260
30    18260
29    18260
28    18260
1     18260
Name: item, dtype: int64

In [14]:
# Number of sales rows per sale date.
sales_df['sale_date'].value_counts()

Thu, 28 Jan 2016 00:00:00 GMT    500
Sat, 20 Jul 2013 00:00:00 GMT    500
Fri, 10 Apr 2015 00:00:00 GMT    500
Fri, 05 Jul 2013 00:00:00 GMT    500
Mon, 04 Apr 2016 00:00:00 GMT    500
Mon, 30 Oct 2017 00:00:00 GMT    500
Sat, 19 Nov 2016 00:00:00 GMT    500
Sat, 05 Apr 2014 00:00:00 GMT    500
Mon, 01 May 2017 00:00:00 GMT    500
Tue, 08 Dec 2015 00:00:00 GMT    500
Tue, 21 Apr 2015 00:00:00 GMT    500
Wed, 25 Sep 2013 00:00:00 GMT    500
Tue, 26 Nov 2013 00:00:00 GMT    500
Fri, 24 Jan 2014 00:00:00 GMT    500
Mon, 02 Mar 2015 00:00:00 GMT    500
Wed, 11 Jun 2014 00:00:00 GMT    500
Sun, 09 Apr 2017 00:00:00 GMT    500
Wed, 13 Aug 2014 00:00:00 GMT    500
Wed, 02 Mar 2016 00:00:00 GMT    500
Tue, 26 Dec 2017 00:00:00 GMT    500
Sat, 29 Jul 2017 00:00:00 GMT    500
Mon, 11 Apr 2016 00:00:00 GMT    500
Thu, 03 Oct 2013 00:00:00 GMT    500
Thu, 29 May 2014 00:00:00 GMT    500
Fri, 06 Nov 2015 00:00:00 GMT    500
Wed, 05 Oct 2016 00:00:00 GMT    500
Mon, 09 Mar 2015 00:00:00 GMT    500
M

In [15]:
# Number of sales rows per store.
sales_df['store'].value_counts()

10    91300
9     91300
8     91300
7     91300
6     91300
5     91300
4     91300
3     91300
2     91300
1     91300
Name: store, dtype: int64

6. **Acquire the Open Power Systems Data for Germany, which has been rapidly expanding its renewable energy production in recent years. The data set includes country-wide totals of electricity consumption, wind power production, and solar power production for 2006-2017. You can get the data here:** https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv

In [16]:
### shamelessly stolen from zach:
from os import path

def get_opsd_data(
    use_cache=True,
    url='https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv',
    cache_path='opsd.csv',
):
    """Load the Open Power Systems Data csv, preferring a local copy.

    Parameters
    ----------
    use_cache : bool, default True
        When true and ``cache_path`` exists, read that file and skip the
        download.
    url : str
        Location of the source csv; anything ``pandas.read_csv`` accepts.
    cache_path : str, default 'opsd.csv'
        Local file used as the cache. It is (re)written after every download.

    Returns
    -------
    pandas.DataFrame
        The csv contents with the default integer index.
    """
    if use_cache and path.exists(cache_path):
        return pd.read_csv(cache_path)

    df = pd.read_csv(url)
    # index=False keeps the cached file identical in shape to the source, so
    # reading the cache gives the same columns as a fresh download.
    df.to_csv(cache_path, index=False)
    return df

In [17]:
opsd_df = get_opsd_data()
# Show a bounded preview rather than dumping the whole frame into the output.
opsd_df.head(10)

Unnamed: 0,Date,Consumption,Wind,Solar,Wind+Solar
0,2006-01-01,1069.18400,,,
1,2006-01-02,1380.52100,,,
2,2006-01-03,1442.53300,,,
3,2006-01-04,1457.21700,,,
4,2006-01-05,1477.13100,,,
5,2006-01-06,1403.42700,,,
6,2006-01-07,1300.28700,,,
7,2006-01-08,1207.98500,,,
8,2006-01-09,1529.32300,,,
9,2006-01-10,1576.91100,,,


7. **Make sure all the work that you have done above is reproducible. That is, you should put the code above into separate functions in the `acquire.py` file and be able to re-run the functions and get the same data.**