In [27]:
"""This notebook scrapes the USDA market research website for historical wholesale produce data
This is done by linking directly to a server request output in html form
Only one year of data can be requested at a time so this notebook loops over years.

To use this in the future don't forget to: look up the short city code for each city, as well as short produce name.
"""

"This notebook scrapes the USDA market research website for historical wholesale produce data\nThis is done by linking directly to a server request output in html form\nOnly one year of data can be requested at a time so this notebook loops over years.\n\nTo use this in the future don't forget to: look up the short city code for each city, as well as short produce name.\n"

In [28]:
import requests
import os
import time
import random
import sys
import pandas as pd

### TODO: ###
- Edit `fetch_data()` to fetch the latest week of data by setting the `repDate=` parameter in the URL to the current date (this will need `datetime` and `.now()` to get the current date).

In [30]:
def fetch_data(producename, regionname, year, directory):
    """Download one calendar year of USDA retail report data for a produce item and region.

    The report for 1/1 through 12/31 of ``year`` is requested and saved as
    ``<producename>_<regionname>_<year>.html`` inside ``directory``. Combinations
    whose file is already present are skipped without contacting the server.

    Parameters
    ----------
    producename : str
        Commodity name, inserted verbatim into the query string, so it must
        already be URL-encoded (e.g. ``'LETTUCE%2C+ICEBERG'``).
    regionname : str
        Region name, inserted verbatim into the query string
        (e.g. ``'NORTHEAST+U.S.'``).
    year : str or int
        Four-digit year to request.
    directory : str
        Target directory, with or without a trailing separator. It is created
        if it does not exist yet.

    Returns
    -------
    bool
        True if the report was downloaded and saved, False if the file already
        existed and nothing was fetched.

    Raises
    ------
    SystemExit
        If the request times out twice in a row.
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status. Nothing is saved in that
        case, so the combination is retried on the next run instead of an
        error page being cached as data.
    """
    filename = f'{producename}_{regionname}_{year}.html'
    directory = str(directory)

    # Build the path with os.path.join so the "already downloaded?" check and
    # the write target always refer to the same file, trailing slash or not.
    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, filename)
    if os.path.exists(target):
        return False

    url = (
        'https://www.marketnews.usda.gov/mnp/fv-report-retail'
        '?repType=&run=&portal=fv&locChoose=&commodityClass=&startIndex=1'
        '&type=retail&class=ALL'
        f'&commodity={producename}'
        f'&region={regionname}'
        '&organic=ALL'
        f'&repDate=01%2F01%2F{year}'
        f'&endDate=12%2F31%2F{year}'
        '&compareLy=No&format=excel&rowDisplayMax=100000'
    )

    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            r = requests.get(url, allow_redirects=True, timeout=300)
        except requests.exceptions.Timeout:
            if attempt < max_attempts:
                print('request timed out, trying again...')
                continue
            print('request timed out again, exiting...')
            sys.exit()

        # Refuse to store an HTTP error page: once a file exists it is treated
        # as downloaded and would never be fetched again.
        r.raise_for_status()

        # Write to a temporary name first so an interrupted write cannot leave
        # a truncated file that later runs would mistake for a finished one.
        partial = target + '.part'
        with open(partial, 'wb') as fh:
            fh.write(r.content)
        os.replace(partial, target)
        return True

In [31]:
# Small parameter set for a quick trial run: two regions, two produce items, one year.
# The strings are inserted verbatim into the request URL by fetch_data().
# Commented-out leftovers, kept for reference:
#   test_city = 'NEW+YORK'
#   test_short = 'NX'
test_regions = [
    'NORTHEAST+U.S.',
    'SOUTHWEST+U.S.',
]
test_producenames = [
    'CARROTS',
    'APPLES',
]
test_year = [
    '2021',
]

In [32]:
#fetch_data(test_city, test_producename, test_year)
#veggies = ['APPLES','APRICOTS','ASPARAGUS','AVOCADOS','BANANAS','BEANS','BEETS','BLACKBERRIES','BLUEBERRIES','BROCCOLI','BRUSSELS+SPROUTS','CABBAGE','CANTALOUPS','CARROTS','CAULIFLOWER','CELERY','CHERRIES','CLEMENTINES','CORN-SWEET','CRANBERRIES', 'CUCUMBERS','EGGPLANT','ENDIVE','GARLIC','GINGER+ROOT','GRAPEFRUIT','GRAPES','HONEYDEWS','KALE+GREENS','KIWIFRUIT','KOHLRABI','LEMONS','LETTUCE%2C+ICEBERG','LETTUCE%2C+ROMAINE','LETTUCE%2C+RED+LEAF','LETTUCE%2C+GREEN+LEAF','LETTUCE%2C+BIBB']
#shortveg = ['APL','APR','ASP','AVOC','BAN','BNS','BTS','BLKBERI-V','BLUBY','BROC','BRSPT','CAB','CANT','CARR','CAUL','CEL','CHER','CLEM','CORN','CRBY','CUX','EGPLT','END','GARLIC','GNGRT','GRPFT','GRPS','HDEW','KALEGRNS','KIWI','KOHLRABI','LEM','LETT','LETTR','RDLFLET-V','GRNLFLET-V','BIBBLET-V']

### Scrape parameters ###
In the next cell, you can add the veggies, regions, and years you want to scrape for.
In the updated version of the scrape, only veggies, regions, and years matter.
The scrape is hard-coded to scrape a whole year (1/1 to 12/31) except for the current year 2021. (There will be another version that just gets 2021 up to present)
This version scrapes all the data that we will need, as of 1/22/21, going back 10 years, so that all the averages can be calculated. It should only be run once, with the results manually dumped into Mongo. In the future, a version of this will be set to scrape the most recently updated week every Friday, run it through the pipeline (calculate new rolling averages, etc.), and dump the results into Mongo, all on the cloud using the cloud function.

All veggies are included since they want to know which specific veggies to go after.

In [33]:
# Scrape parameters: the produce items, regions and years available for scraping,
# plus the directory the downloaded files are written to.

# Produce names as they appear in the request URL ('+' for spaces, '%2C' for commas).
veggies = [
    'APPLES', 'APRICOTS', 'ASPARAGUS', 'AVOCADOS', 'BANANAS', 'BEANS', 'BEETS',
    'BLACKBERRIES', 'BLUEBERRIES', 'BROCCOLI', 'BRUSSELS+SPROUTS', 'CABBAGE',
    'CANTALOUPS', 'CARROTS', 'CAULIFLOWER', 'CELERY', 'CHERRIES', 'CLEMENTINES',
    'CORN-SWEET', 'CRANBERRIES', 'CUCUMBERS', 'EGGPLANT', 'ENDIVE', 'GARLIC',
    'GINGER+ROOT', 'GRAPEFRUIT', 'GRAPES', 'HONEYDEWS', 'KALE+GREENS', 'KIWIFRUIT',
    'KOHLRABI', 'LEMONS',
    'LETTUCE%2C+ICEBERG', 'LETTUCE%2C+ROMAINE', 'LETTUCE%2C+RED+LEAF',
    'LETTUCE%2C+GREEN+LEAF', 'LETTUCE%2C+BIBB',
    'LIMES', 'MANGOES', 'MUSHROOMS', 'NECTARINES', 'OKRA', 'ORANGES', 'PEACHES',
    'PEARS', 'PEAS+GREEN', 'PEPPERS%2C+BELL+TYPE', 'PINEAPPLES', 'PLUMS',
    'POTATOES', 'PUMPKINS', 'RADISHES', 'RASPBERRIES', 'RHUBARB', 'SPINACH',
    'SQUASH', 'STRAWBERRIES', 'SWEET+POTATOES', 'TOMATOES', 'TURNIPS', 'WATERMELONS',
]

# Short produce codes in the same order as `veggies`; commented out and not used
# by this cell (presumably left over from an earlier version of the scrape).
# shortveg = ['APL','APR','ASP','AVOC','BAN','BNS','BTS','BLKBERI-V','BLUBY','BROC','BRSPT','CAB','CANT','CARR','CAUL','CEL','CHER','CLEM','CORN','CRBY','CUX','EGPLT','END','GARLIC','GNGRT','GRPFT','GRPS','HDEW','KALEGRNS','KIWI','KOHLRABI','LEM','LETT','LETTR','RDLFLET-V','GRNLFLET-V','BIBBLET-V','LIM','MANGO','MUSH','NECT','OKRA','ORG','PCH','PEAR','PEASG','PEP','PINE','PLUM','POTS','PUMP','RAD','RASP','RHUB','SPIN','SQU','STRBY','SWPOT','TOM','TRNP','WMEL']

regions = [
    'NORTHEAST+U.S.',
    'SOUTHEAST+U.S.',
    'MIDWEST+U.S.',
    'SOUTH+CENTRAL+U.S.',
    'SOUTHWEST+U.S.',
    'NORTHWEST+U.S.',
    'HAWAII',
    'ALASKA',
]

# Ten full calendar years, 2011 through 2020, as strings.
years = [str(calendar_year) for calendar_year in range(2011, 2021)]

# Directory the raw downloads are saved into (note the trailing slash).
targdir = './raw_data/'

In [34]:
# Fetch every (region, produce item, year) combination into targdir. This can take a while.
# NOTE(review): the loop iterates the small test_* lists, not the full
# regions / veggies / years lists -- confirm which set is intended before a full scrape.

# One message per allowed retry; once these are used up the error propagates.
retry_messages = (
    'problem fetching, trying again...',
    'problem fetching, trying one last time...',
)

# Seed once from system entropy; reseeding on every iteration adds nothing.
random.seed()

initial_time = time.time()
count_region = 0
for reg in test_regions:
    count_veg = 0
    for v in test_producenames:
        for y in test_year:
            # Random pause length (seconds) used after a download and before a retry,
            # presumably to avoid hammering the server.
            sleeptime = random.randint(8, 18)
            for retry_message in retry_messages + (None,):
                try:
                    fetched = fetch_data(v, reg, y, targdir)
                except OSError:
                    if retry_message is None:
                        # Third failure in a row: give up and surface the error.
                        raise
                    print(retry_message)
                    # Back off before retrying instead of re-requesting immediately.
                    time.sleep(sleeptime)
                    continue
                if fetched:
                    print('fetched '+str(reg)+' '+str(v)+' '+str(y)+', sleeping for '+str(sleeptime)+'s')
                    time.sleep(sleeptime)
                # Success or cache hit (fetch_data returned False): no more attempts needed.
                break
        # Counters are not read in this cell; kept in case another cell uses them.
        count_veg += 1
    count_region += 1
final_time = time.time()
print('fetched in '+str(round(final_time-initial_time, 2))+'s')

fetched NORTHEAST+U.S. APPLES 2021, sleeping for 17s
fetched in 18.06s
