#### Set program variables

In [None]:
import os
import re

# Page that is scraped for the XLSX and ZIP download links.
page_url = "https://www.ers.usda.gov/data-products/fruit-and-vegetable-prices"

# Most recent data year known to be available (2023); used to tag workbooks
# that are downloaded directly rather than unpacked from an archive.
last_year = 2023

# Folder that receives the generated per-symbol CSV files; create it up front.
data_dir = "/temp-output-directory/alternative/usda/fruitandvegetables/"
os.makedirs(data_dir, exist_ok=True)

def normalize(x):
    """Return *x* lower-cased with hyphens and spaces turned into underscores.

    File names are normalized this way so they sort consistently.
    """
    # One translate pass maps both separator characters to '_'.
    separators_to_underscore = str.maketrans('- ', '__')
    lowered = x.lower()
    return lowered.translate(separators_to_underscore)

#### Data Downloader

In [None]:
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile

# Scrape the USDA page to find all XLSX and ZIP download links.
print(f"Scraping {page_url}...")
# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops us from silently parsing an HTTP error page
# (which would otherwise just report "Found 0 files").
response = requests.get(page_url, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

wanted_extensions = ('.zip', '.xlsx')
download_links = []
for a in soup.find_all('a', href=True):
    # Compare case-insensitively, but keep the original href for downloading.
    href = a['href'].lower()
    # Accept hrefs that end with the extension or have a query string after it.
    is_download = any(
        href.endswith(ext) or f'{ext}?' in href for ext in wanted_extensions
    )
    # Skip hrefs already collected so a file linked twice is fetched once.
    if is_download and a['href'] not in download_links:
        download_links.append(a['href'])

print(f"Found {len(download_links)} files to download")

# Download each file
from urllib.parse import urljoin

# Workbooks end up in this folder, whether they were downloaded directly or
# unpacked from a ZIP archive. Create it once instead of on every iteration.
extract_dir = "extracted"
os.makedirs(extract_dir, exist_ok=True)

downloaded_files = []
for link in download_links:
    # Construct full URL. urljoin resolves relative, root-relative and
    # protocol-relative hrefs against the page URL and leaves absolute URLs
    # untouched (no hard-coded slice of page_url, no doubled slash).
    file_url = urljoin(page_url, link)

    # Extract filename from URL (last path segment, query string removed)
    filename = normalize(file_url.split('/')[-1].split('?')[0])

    # Save XLSX files within extracted folder where we will also extract ZIP contents
    if filename.endswith('.xlsx'):
        index = filename.find('average')
        if index > 0:
            # Keep the part before 'average', without its trailing separator.
            stem = filename[:index].rstrip('_')
        else:
            # 'average' is missing (find() returned -1) or leads the name:
            # fall back to the whole name without its extension rather than
            # slicing with a negative index and mangling the file name.
            stem = filename[:-len('.xlsx')]
        filename = os.path.join(extract_dir, f"{stem}_{last_year}.xlsx")

    print(f"\nDownloading: {filename}")
    print(f"URL: {file_url}")

    try:
        # Timeout so one stalled download cannot block the whole run.
        file_response = requests.get(file_url, timeout=60)

        if file_response.status_code == 200:
            # Save to file. The handle is closed before the archive is
            # reopened below; otherwise the buffered tail of the file (where
            # the ZIP central directory lives) may not be on disk yet.
            with open(filename, "wb") as f:
                f.write(file_response.content)

            if filename.endswith('.zip'):
                print(f"\n{'='*60}")
                print(f"Processing: {filename}")
                print(f"{'='*60}")
                try:
                    with ZipFile(filename, 'r') as zip_ref:
                        # Extract only the XLSX members into the shared folder
                        files = [
                            zip_ref.extract(member, extract_dir)
                            for member in zip_ref.namelist()
                            if member.endswith('.xlsx')
                        ]
                        print(f"XLSX files in archive: {files}")
                except Exception as e:
                    # Best effort: report a bad archive and carry on.
                    print(f"Error processing ZIP file: {e}")

            print(f"✓ Success - {len(file_response.content)} bytes")
            downloaded_files.append(filename)
        else:
            print(f"✗ Failed - Status code: {file_response.status_code}")
    except Exception as e:
        # Best effort: a failed download is reported and the loop continues.
        print(f"✗ Error: {e}")

print(f"\n{'='*60}")
print(f"Download complete: {len(downloaded_files)} files saved")
print(f"{'='*60}")
for f in downloaded_files:
    print(f"  • {f}")

#### Data Processor

In [None]:
import openpyxl
from itertools import groupby

def get_hash(x):
    """Return the grouping key of a file name: the part before its year.

    The name is normalized first, then cut at the first run of four digits;
    underscores left dangling at the end of the prefix are removed.

    Args:
        x: File name (with or without extension).

    Returns:
        The normalized prefix that precedes the first four-digit number.

    Raises:
        ValueError: If the name contains no four-digit number.
    """
    x = normalize(x)
    match = re.search(r'\d{4}', x)
    if match is None:
        raise ValueError(f"No four-digit year found in file name: {x!r}")
    # match.start() is the offset the original looked up again with str.find.
    # Strip separators instead of blindly dropping one character, so a name
    # without a separator before the year does not lose a real letter.
    return x[:match.start()].rstrip('_')
    
# Collect the workbooks to process. Names without a four-digit year cannot be
# dated or grouped, so they are reported and skipped instead of crashing the run.
source_dir = 'extracted'
allfiles = []
for x in os.listdir(source_dir):
    if not x.endswith('.xlsx'):
        continue
    if re.search(r'\d{4}', x) is None:
        print(f"Skipping {x}: no four-digit year in file name")
        continue
    allfiles.append(x)

# groupby only merges *adjacent* items, so sort by the grouping key first and
# by the normalized name second (keeps the files of one symbol in name order).
allfiles.sort(key=lambda name: (get_hash(name), normalize(name)))
files_by_symbol = groupby(allfiles, key=get_hash)
data_by_symbol = {}
for symbol, files in files_by_symbol:
    print(f"Processing symbol: {symbol}")
    data = []
    for file_name in files:
        year = re.search(r'(\d{4})', file_name).group(1)
        wb = openpyxl.load_workbook(os.path.join(source_dir, file_name), data_only=True)
        # Use the public worksheets list rather than the private _sheets attribute.
        for row in wb.worksheets[0]:
            values = [cell.value for cell in row]
            # Keep only rows holding at least one non-zero float; anything
            # else (titles, headers, notes, blank rows) is skipped.
            if not any(x for x in values if isinstance(x, float)):
                continue
            # Drop a single trailing digit from the first cell. Guard against
            # None, non-string and empty cells, which cannot be indexed.
            label = values[0]
            if isinstance(label, str) and label and label[-1].isdigit():
                values[0] = label[:-1]
            # Each line is stamped with January 1st of the year after the
            # year found in the file name.
            # NOTE(review): values are joined with bare commas, so a cell that
            # itself contains a comma shifts the columns — confirm downstream.
            line = f'{int(year)+1}0101,' + ','.join(normalize(str(v).strip()) for v in values)
            data.append(line)
    with open(os.path.join(data_dir, f'{symbol}.csv'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(data))