## Import needed libraries

In [None]:
!!pip freeze # shows you all the packages installed in your local environment

In [None]:
import json
import requests
import time

## Assign values to variables

These will be used later in the code. In the future, we may want to import these variables from a .yaml or .json file. Configurations should be imported or extracted from a parsable, human-friendly config file. When setting up complex systems, it's nice to have configuration files thoughtfully organized.

First let's define some variables that will be helpful.  These values are stored in `food-app-database/instance/config.py`

*LAST_NDBNO_TOTAL* is the last recorded number of items in the USDA database.  

*LAST_SR* is the version number of the USDA Database

*API_KEY* is the identifier acquired through USDA api services

*q* is the search term (any string) for the search API.  We'll most likely leave this blank.

*ds* is the datasource.  Must be 'Branded Food Products', 'Standard Reference', or ''.  We'll leave it blank to include both.

*fg* is the Food group ID.  We'll also leave this blank.

*sort* the results by food name (n) or by search relevance (r).  We'll sort by food name (n) for standardization's sake.

*mx* refers to the maximum number of items to return.  This seems to max out at 1500.

*offset* is the index of the first row of the result set to return.  

*formt* can either be JSON ('json') or XML ('xml').  We'll stick with JSON.

In [None]:
# Values recorded at the last sync; compared against the live values to detect
# a change in the USDA database.
LAST_NDBNO_TOTAL = 200000
LAST_SR = 28

# Live values; placeholders until the database-status request overwrites them.
current_ndbno_total = 0
current_sr = 0

# NOTE(review): the key is committed in plain text -- consider loading it from
# a config file or the environment instead.
API_KEY = '7WqOHQdC2shEfBrx25bIEwxBkvUkYTHMoHYlLWL8' #1000 requests/hour

# Search / report API parameters.
q = ''
ds = ''
fg = ''
sort = 'n'
mx = 5 # max is 1500
offset = 0 # kept as an int so page offsets can be computed as offset + page * mx
formt = 'json'
typ = 'f'

In [None]:
# Special API request to get meta information on the database (total number of
# items, standard reference version). Built from API_KEY so the key is defined
# in one place only.
initial_search_request = 'https://api.nal.usda.gov/ndb/search/?format=json&q=&sort=n&max=1&offset=0&api_key={}'.format(API_KEY)

# search API
search_api_request = 'https://api.nal.usda.gov/ndb/search/?format={}&q={}&sort={}&max={}&offset={}&api_key={}'.format(formt, q, sort, mx, offset, API_KEY)

# Get JSON report with API.
# `ndbno_id` is only assigned later, per item, inside the unpacking loop, so
# formatting with it here raised a NameError. A sample id keeps this cell
# runnable on its own.
sample_ndbno_id = '01009'
report_api_request = 'https://api.nal.usda.gov/ndb/reports/?ndbno={}&type={}&format={}&api_key={}'.format(sample_ndbno_id, typ, formt, API_KEY)

## Let's define some useful functions 

In [None]:
def get_db_status(initial_search_request):
    '''Fetch the item count and release version reported by the search API.

    initial_search_request = search URL to request, e.g.
        'https://api.nal.usda.gov/ndb/search/?format=json&q=&sort=n&max=1&offset=0&api_key=DEMO_KEY'

    Both values are read from the 'list' object of the JSON response:
    current_total = its 'total' entry
    current_sr = its 'sr' entry

    Returns {'current_total': current_total, 'current_sr': current_sr}
    '''
    listing = requests.get(initial_search_request).json()['list']
    return {
        'current_total': listing['total'],
        'current_sr': listing['sr'],
    }

def check_dbitem_changed(LAST_NDBNO_TOTAL, current_ndbno_total, LAST_SR, current_sr): #TODO: convert to decorator
    '''Returns True if the database differs from the last recorded state.

    A difference in either the item total or the SR version counts as a
    change; returns False only when both pairs compare equal.
    '''
    # `or` short-circuits, so the SR pair is only compared when the totals match.
    return bool(LAST_NDBNO_TOTAL != current_ndbno_total or LAST_SR != current_sr)

def get_ndbno_list(search_api_request, formt, q, sort, mx, offset, API_KEY):
    '''Return the food items listed by one search API request.

    Only search_api_request is read; formt, q, sort, mx, offset and API_KEY
    are accepted but not used, so the URL must already contain them.

    Returns the ['list']['item'] entry of the JSON response (a list of
    dictionaries, where each dictionary is a unique food item).
    '''
    response_json = requests.get(search_api_request).json()
    return response_json['list']['item']

def get_ndbno_full_report(report_api_request, ndbno_id):
    '''Returns the parsed JSON body of a report API request.

    report_api_request = fully formatted report URL; it is requested as-is.
    ndbno_id = accepted but not read here, so the URL must already name the
        item to fetch.
    '''
    return requests.get(report_api_request).json()
    

## Determine the number of items in the USDA database 

We will be using this marker as well as the last-updated date to initiate a scan for new data.  Running the website through the Internet Archive will do the trick.

In [None]:
# Ask the API for the live item count and SR version, then report both.
db_status = get_db_status(initial_search_request)
current_ndbno_total = db_status['current_total']
current_sr = db_status['current_sr']
for label, value in (
    ("Current Number of Items in database: ", current_ndbno_total),
    ("Current Standard Reference Database Version: ", current_sr),
):
    print(label, value)

## Grab Metadata for items in the USDA database

In [None]:
# total amount that we can search
print("The maximum number of items we can pull from a request is: ", mx)

# determine how many search API requests will need to be done
import math
# search_num = math.ceil(current_ndbno_total/mx) # to round up!
search_num = 1 # for testing

# Same layout as search_api_request, but formatted once per page so that each
# request carries its own offset instead of repeating the first page.
search_url_template = 'https://api.nal.usda.gov/ndb/search/?format={}&q={}&sort={}&max={}&offset={}&api_key={}'

ndbno_list = []

for page in range(search_num):
    # `offset` may be stored as a string, so normalise it before the arithmetic
    offset_temp = str(int(offset) + page * mx)

    page_request = search_url_template.format(formt, q, sort, mx, offset_temp, API_KEY)
    ndbno_list.extend(get_ndbno_list(page_request, formt, q, sort, mx, offset_temp, API_KEY))

# print results
for item in ndbno_list:
    print(item)

### Create database instances for food_desc, food_unit, food_upc, nut_per_100_g for each item in pull request

In [None]:
# True when the live item total or SR version differs from the recorded values.
db_changed = check_dbitem_changed(LAST_NDBNO_TOTAL, current_ndbno_total, LAST_SR, current_sr)
print(db_changed)

## Let's see if there's been an update in the USDA database

## If there's been an update, let's get a list of all the ndbno's in the database at this current time

In [None]:
# db.create_all()

# Report API URL layout; formatted per item so that every request targets that
# item's own ndbno rather than one URL built ahead of the loop.
report_url_template = 'https://api.nal.usda.gov/ndb/reports/?ndbno={}&type={}&format={}&api_key={}'

# let's unpack data from JSON
for item in ndbno_list:

    # let's get the metadata
    # NOTE(review): the slices assume the name ends with a 19-character suffix
    # whose last 12 characters are the UPC -- confirm this holds for every item.
    name = item['name'][:-19]
    upc = item['name'][-12:]
    ndbno_id = item['ndbno']
    food_cat = item['group']

    # let's get the full report json data
    item_report_request = report_url_template.format(ndbno_id, typ, formt, API_KEY)
    full_report_json = get_ndbno_full_report(item_report_request, ndbno_id)
    print(full_report_json)

    # NOTE(review): both values are read from the 'ing' object of the report --
    # confirm that is where the description and update date are meant to come from.
    short_desc = full_report_json['report']['food']['ing']['desc']
    updated = full_report_json['report']['food']['ing']['upd']

    # TODO: create class instantiations; unit_desc and grams_per_unit still
    # need to be extracted from the report (the bare names raised NameError).



## Let's crossreference this with all the ndbno's in the USDA food's database

There should be a better way to do this without downloading the full report for every ndbno.  The problem is that the update information is located in the full report, and not in the metadata.  This is bad design...

## Now let's get a list of all the new ndbno's

Let's fetch the JSON data of each of the new ndbno's

In [None]:
# Named `typ` (as in the configuration cell) rather than `type`: assigning to
# `type` shadows the builtin that the example cells call.
typ = 'f' # get full reports only

usda_database_check = requests.get(initial_search_request)
check_json = usda_database_check.json()

print(check_json)

items_total = check_json['list']['total'] # total number of items
print(items_total)

# get all ndbno in database
get_all_items_api_request = 'https://api.nal.usda.gov/ndb/search/?format=json&q=&sort=n&max=1&offset=0&api_key=DEMO_KEY'

In [None]:
# Request the example food-composition report, then show the response's class
# next to the response itself.
example = requests.get(url=example_food_comp_api_request)
example_class = type(example)
print(example_class, example)

In [None]:
# Decode the example response body as JSON and show the decoded object's class
# next to the object itself.
example_json = example.json()
example_json_class = type(example_json)
print(example_json_class, example_json)

In [None]:
start_url = "https://ndb.nal.usda.gov/ndb/search/list"
# NOTE(review): API keys are committed in plain text -- consider moving them to
# a config file or the environment.
api_key1 = '7WqOHQdC2shEfBrx25bIEwxBkvUkYTHMoHYlLWL8'
api_key2 = 'avWkKpCJmnTjviy9vfKmsaZkGALXNabV05zUyXQ1'

example_search_api_request = 'https://api.nal.usda.gov/ndb/search/?format=json&q=butter&max=25&offset=0&api_key=DEMO_KEY'
# format=json (was xml): the response to this request is decoded with `.json()`
example_food_comp_api_request = 'https://api.nal.usda.gov/ndb/reports/?ndbno=01009&type=f&format=json&api_key=DEMO_KEY'

# Scraping settings; dir_name is the directory download_url writes into.
html_target = "a"
tag = "href"
f_ext = ".csv"
dir_name = "..."

### Request and Collect

We instantiate a request object and call the `.get` method on it. `result` is our `HTTP 1.1` response. 

From here we have:

*    status
*    encoding
*    text of the body --- should type check this
*    content of the body --- type binary

Once we have our `html` we are ready to scrape the site for useful `href` tags.



In [None]:
# Fetch the start page and keep the pieces of the response used afterwards:
# status code, declared encoding, decoded text and raw bytes.
result = requests.get(start_url)

status, encoding = result.status_code, result.encoding
html_doc, c = result.text, result.content

print(status, encoding)

Now let's create a BeautifulSoup object out of the response content so we can more easily navigate the html

In [None]:
# Parse the raw response bytes (`c`) using the 'lxml' parser.
# NOTE(review): no import of BeautifulSoup is visible in this notebook --
# confirm that `from bs4 import BeautifulSoup` runs before this cell.
soup = BeautifulSoup(c, 'lxml')

In [None]:
# Print the parsed document as indented markup.
print(soup.prettify())

In [None]:
# Display the document's <title> element.
soup.title

In [None]:
# Display the tag name of the <title> element.
soup.title.name

In [None]:
# Display the text inside the <title> element.
soup.title.string

In [None]:
# Display the tag name of the element that contains <title>.
soup.title.parent.name

In [None]:
# Display the first <p> element in the document.
soup.p

In [None]:
# Display the first <a> element in the document.
soup.a

#### download_url(url, endpoint)

This function makes a new request each time it's called. It writes the binary content to file.
This could be two functions:
1. Get the new request object/content 
2. Write that content to file 

In [None]:
def download_url(url, endpoint):
    """
    Download "{url}/{endpoint}" and write the response body to
    "{dir_name}/{endpoint}".

    url: base address; joined to `endpoint` with a "/".
    endpoint: path below `url`; also used as the file name inside `dir_name`.

    return: None
    downloads file, requires `dir_name` in global scope. Makes a new request
    each time it is called.
    """
    url_addr = "{url}/{endpoint}".format(url=url, endpoint=endpoint)
    file_path = "{directory}/{endpoint}".format(directory=dir_name, endpoint=endpoint)

    r = requests.get(url_addr)
    content_file = r.content

    with open(file_path, 'wb') as f:
        # print() as a function call: the bare `print "..."` statement is a
        # syntax error on Python 3.
        print("""Downloading From: {url}\nWriting to: {file_path}""".format(
            url=url_addr,
            file_path=file_path,
        ))
        f.write(content_file)