### Import needed libraries

In [1]:
!!pip freeze # shows you all the packages installed in your local environment

['adium-theme-ubuntu==0.3.4',
 'attrs==15.2.0',
 'autoenv==1.0.0',
 'backports.shutil-get-terminal-size==1.0.0',
 'beautifulsoup4==4.6.0',
 'bleach==2.0.0',
 'bs4==0.0.1',
 'chardet==2.3.0',
 'click==6.7',
 'configparser==3.5.0',
 'cryptography==1.2.3',
 'decorator==4.0.6',
 'deluge==1.3.12',
 'entrypoints==0.2.3',
 'enum34==1.1.2',
 'functools32==3.2.3.post2',
 'futures==3.1.1',
 'html5lib==0.999999999',
 'idna==2.0',
 'ipaddress==1.0.16',
 'ipykernel==4.6.1',
 'ipyparallel==6.0.2',
 'ipython==5.4.1',
 'ipython-genutils==0.2.0',
 'ipywidgets==6.0.0',
 'itsdangerous==0.24',
 'Jinja2==2.9.6',
 'jsonschema==2.6.0',
 'jupyter==1.0.0',
 'jupyter-client==5.1.0',
 'jupyter-console==5.1.0',
 'jupyter-core==4.3.0',
 'MarkupSafe==1.0',
 'mistune==0.7.4',
 'nbconvert==5.2.1',
 'nbformat==4.3.0',
 'notebook==5.0.0',
 'numpy==1.11.0',
 'PAM==0.4.2',
 'pandocfilters==1.4.1',
 'pathlib2==2.3.0',
 'pbr==3.1.1',
 'pexpect==4.0.1',
 'pickleshare==0.7.4',
 'prompt-toolkit==1.0.14',
 'ptyprocess==0.5',
 

In [None]:
import json
import math
import os
import time

import requests
from bs4 import BeautifulSoup

### Determine the number of items in the USDA database 

We will be using this marker, as well as the last-updated date, to initiate a scan for new data.  Running the website through the Internet Archive will do the trick.

### Assign values to variables

These will be used later in the code. In the future, we may want to import these variables from a .yaml or .json file. Configurations should be imported or extracted from a parsable, human-friendly config file. When setting up complex systems, it's nice to have configuration files thoughtfully organized.

First let's define some variables that will be helpful.  These values are stored in `food-app-database/instance/config.py`

*NDBNO_TOTAL* is the last recorded number of items in the USDA database.  This should be recorded in `food-app-database/instance/config.py`

*LAST_SR* is the version number of the USDA Database

*API_KEY* is the identifier acquired through USDA api services

*q* is the search term (any string) for the search API.  We'll most likely leave this blank

*ds* is the datasource.  Must be 'Branded Food Products', 'Standard Reference', or ''.  We'll leave it blank to include both.

*fg* is the Food group ID.  We'll also leave this blank

*sort* the results by food name (n) or by search relevance (r).  We'll sort by food name (n) for standardization's sake.

*max* refers to the maximum number of items to return.  This seems to max out at 1500.

*offset* is the index of the first row of the result set to return.

*format* can be either JSON ('json') or XML ('xml').  We'll stick with JSON.

In [2]:
# Last recorded state of the USDA database; compared against the live values
# to decide whether a re-scan is needed.
NDBNO_TOTAL = 200000
LAST_SR = 28

# Never commit a real key: read it from the environment and fall back to the
# public DEMO_KEY that the example requests in this notebook already use.
API_KEY = os.environ.get('USDA_API_KEY', 'DEMO_KEY')

# Search-API query parameters (see the descriptions above).
# NOTE: `max` and `format` shadow Python builtins; the names are kept because
# later cells refer to them.
q = ''
ds = ''
fg = ''
sort = 'n'
max = 1500 # apparently this maxes out at 1500..
offset = 0 # first row of the result set, matching offset=0 in the request below; 1 would skip the first item
format = 'json'

# special api request to get meta information on database (total number of items, standard reference version)
# The key is interpolated here; the previous URL sent the literal text "API_KEY".
initial_search_request = 'https://api.nal.usda.gov/ndb/search/?format=json&q=&sort=n&max=1&offset=0&api_key={}'.format(API_KEY)

NameError: name 'offset_temp' is not defined

## Let's define some useful functions 

In [None]:
def get_db_status(initial_search_request, timeout=30):
    '''Fetch the item count and Standard Release version reported by the search API.

    initial_search_request = search-API URL asking for a single item, e.g.
        'https://api.nal.usda.gov/ndb/search/?format=json&q=&sort=n&max=1&offset=0&api_key=DEMO_KEY'
    timeout = seconds to wait for the server before giving up

    total_number = value of ['list']['total'] in the response
    current_sr = value of ['list']['sr'] in the response

    Returns {'total': total_number, 'current_sr': current_sr}

    Raises the HTTP error from requests on a 4xx/5xx response (for example a
    rejected API key) and KeyError if the JSON body has no 'list' object.
    '''
    response = requests.get(initial_search_request, timeout=timeout)
    # Fail loudly on an error status instead of with an opaque KeyError below.
    response.raise_for_status()
    listing = response.json()['list']
    return {'total': listing['total'], 'current_sr': listing['sr']}

def check_dbitem_changed(NDBNO_TOTAL, current_ndbno, LAST_SR, current_sr): # can this be a wrapper?
    '''Returns True if the item count or the SR version differs from the recorded value

    NDBNO_TOTAL / LAST_SR are the recorded values; current_ndbno / current_sr
    are the values to compare against them.  Returns False only when both match.
    '''
    # The SR comparison is only evaluated when the item counts are equal.
    return bool(NDBNO_TOTAL != current_ndbno or LAST_SR != current_sr)

def get_ndbno(current_ndbno, search_api_request, format, q, sort, max, offset, API_KEY, timeout=30):
    '''Return the ndbno of every food item on one page of search results.

    search_api_request = fully built search-API URL to fetch
    timeout = seconds to wait for the server before giving up

    The remaining parameters (current_ndbno, format, q, sort, max, offset,
    API_KEY) are not used by this function; they are kept so existing callers
    keep working.

    Returns a list with one ndbno per item, in the order the API returned them.

    Raises the HTTP error from requests on a 4xx/5xx response and KeyError if
    the JSON body lacks ['list']['item'] or an item has no 'ndbno' field.
    '''
    response = requests.get(search_api_request, timeout=timeout)
    response.raise_for_status()

    # ['list']['item'] is a list of dictionaries, one per food item.
    items = response.json()['list']['item']

    # Keep only the ndbno field; the previous loop appended every value of
    # every item dictionary, so the result was not a list of ndbno's.
    return [item['ndbno'] for item in items]

## Let's find the number of items in the USDA database

In [None]:
# Query the live database once, then list each reported field with its value.
db_status = get_db_status(initial_search_request)
for field in db_status:
    print(field, db_status[field])

## Let's see if there's been an update in the USDA database

In [None]:
# Recorded values are paired with the matching live values from db_status.
print(
    check_dbitem_changed(
        NDBNO_TOTAL,
        db_status['total'],
        LAST_SR,
        db_status['current_sr'],
    )
)

## If there's been an update, let's get a list of all the ndbno's

In [None]:
# total number of items in database
current_ndbno = db_status['total']
print(current_ndbno)

# total amount that we can search
print(max)

# determine how many number of API request for searches will need to be done
# Deliberately limited to a single page for now; use
# math.ceil(float(current_ndbno) / max) to walk the whole database.
search_num = 1
print(search_num)

ndbno_list = []

for page_index in range(search_num):
    # each page starts `max` rows after the previous one
    offset_temp = offset + (page_index * max)

    # Build the URL for this page BEFORE fetching it; it used to be built
    # after the call, which raised a NameError on a fresh kernel.
    search_api_request = 'https://api.nal.usda.gov/ndb/search/?format={}&q={}&sort={}&max={}&offset={}&api_key={}'.format(format, q, sort, max, offset_temp, API_KEY)

    ndbno_list.extend(
        get_ndbno(current_ndbno, search_api_request, format, q, sort, max, offset_temp, API_KEY)
    )

print(ndbno_list)

## Let's cross-reference this with all the ndbno's in the USDA foods database

There should be a better way to do this without downloading all the ndbno's each time...

## Now let's get a list of all the new ndbno's

Let's fetch the JSON data of each of the new ndbno's

In [None]:
# Renamed from `type`: rebinding the builtin made the `type(...)` calls in the
# following cells fail with "'str' object is not callable".
report_type = 'f' # get full reports only

usda_database_check = requests.get(initial_search_request)
check_json = usda_database_check.json()

print(check_json)

items_total = check_json['list']['total'] # total number of items in the database
print(items_total)

# get all ndbno in database
# NOTE(review): despite the name, this URL asks for a single item (max=1) - confirm intent.
get_all_items_api_request = 'https://api.nal.usda.gov/ndb/search/?format=json&q=&sort=n&max=1&offset=0&api_key=DEMO_KEY'

In [None]:
# Full food report for ndbno 01009.  The URL is assigned here because this cell
# runs before the later cell that also assigns this name, and it asks for JSON
# because the next cell decodes the response with `example.json()`.
example_food_comp_api_request = 'https://api.nal.usda.gov/ndb/reports/?ndbno=01009&type=f&format=json&api_key=DEMO_KEY'

example = requests.get(example_food_comp_api_request)
print(type(example), example)

In [None]:
# Decode the body of the `example` response as JSON.
example_json = example.json()

# Show the decoded object's Python type followed by its contents.
print(type(example_json), example_json)

In [None]:
start_url = "https://ndb.nal.usda.gov/ndb/search/list"

# Keys are read from the environment so no credential is stored in the
# notebook; both fall back to the public DEMO_KEY used by the examples below.
api_key1 = os.environ.get('USDA_API_KEY', 'DEMO_KEY')
api_key2 = os.environ.get('USDA_API_KEY_2', 'DEMO_KEY')

example_search_api_request = 'https://api.nal.usda.gov/ndb/search/?format=json&q=butter&max=25&offset=0&api_key=DEMO_KEY'
# format=json (was xml): the response to this request is decoded with `.json()`.
example_food_comp_api_request = 'https://api.nal.usda.gov/ndb/reports/?ndbno=01009&type=f&format=json&api_key=DEMO_KEY'

# Settings for the HTML scraping / download cells below.
html_target = "a"
tag = "href"
f_ext = ".csv"
dir_name = "..."

### Request and Collect

We call `requests.get` on the start URL. `result` is our `HTTP/1.1` response.

From here we have:

*    status
*    encoding
*    text of the body --- should type check this
*    content of the body --- type binary

Once we have our `html`, we are ready to scrape the site for useful `href` attributes.



In [None]:
# Fetch the start page, then pull out the response metadata and both body forms.
result = requests.get(start_url)

status, encoding = result.status_code, result.encoding
html_doc, c = result.text, result.content  # decoded text and raw bytes

print(status, encoding)

Now let's create a beautifulsoup object out of the request object so we can more easily navigate the html

In [None]:
# Parse the raw response bytes (`c`) into a navigable tree.
# NOTE(review): the 'lxml' parser needs the third-party lxml package, which is
# not in the pip freeze listing at the top of this notebook - confirm it is installed.
soup = BeautifulSoup(c, 'lxml')

In [None]:
# Print the whole parsed document as indented markup (the output can be very long).
print(soup.prettify())

In [None]:
# The document's <title> tag.
soup.title

In [None]:
# The tag name of that element.
soup.title.name

In [None]:
# The text inside the <title> tag.
soup.title.string

In [None]:
# The name of the tag that encloses <title>.
soup.title.parent.name

In [None]:
# The first <p> tag in the document.
soup.p

In [None]:
# The first <a> tag in the document.
soup.a

#### download_url(url, endpoint)

This function makes a new request each time it's called. It writes the binary content to file.
This could be two functions:
1. Get the new request object/content 
2. Write that content to file 

In [None]:
def download_url(url, endpoint):
    """
    return: None
    Downloads "{url}/{endpoint}" and writes the raw response bytes to
    "{dir_name}/{endpoint}"; requires dir_name in global or class scope.

    url: base address, without a trailing slash
    endpoint: file name appended to both the url and the target directory

    Raises the HTTP error from requests on a 4xx/5xx response, before any
    file is created, so an error page is never saved as if it were data.
    """
    url_addr = "{url}/{endpoint}".format(url=url, endpoint=endpoint)
    file_path = "{directory}/{endpoint}".format(directory=dir_name, endpoint=endpoint)

    r = requests.get(url_addr)
    r.raise_for_status()
    content_file = r.content

    with open(file_path, 'wb') as f:
        # print() call form: valid on Python 2 and 3 (the statement form was
        # a SyntaxError on Python 3).
        print("""Downloading From: {url}\nWriting to: {file_path}""".format(
            url=url_addr,
            file_path=file_path
        ))
        f.write(content_file)