## Analytics.usa.gov website URL Examples

    'https://analytics.usa.gov/data/live/all-domains-30-days.csv',
    'https://analytics.usa.gov/data/live/top-downloads-yesterday.csv'
    'https://analytics.usa.gov/data/live/top-traffic-sources-30-days.csv'
    'https://analytics.usa.gov/data/live/top-exit-pages-30-days.csv'
    'https://analytics.usa.gov/data/live/all-pages-realtime.csv'
    'https://analytics.usa.gov/data/live/realtime.json'
    'https://analytics.usa.gov/data/live/language.csv'
    'https://analytics.usa.gov/data/live/top-countries-realtime.json'
    'https://analytics.usa.gov/data/live/top-cities-realtime.json'
    'https://analytics.usa.gov/data/live/devices.csv'
    'https://analytics.usa.gov/data/live/device_model.csv'
    
    'https://analytics.usa.gov/data/agency-international-development/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/agriculture/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/commerce/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/defense/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/education/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/energy/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/health-human-services/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/homeland-security/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/housing-urban-development/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/justice/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/labor/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/state/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/transportation/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/veterans-affairs/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/interior/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/treasury/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/environmental-protection-agency/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/executive-office-president/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/general-services-administration/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/national-aeronautics-space-administration/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/national-archives-records-administration/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/national-science-foundation/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/nuclear-regulatory-commission/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/office-personnel-management/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/postal-service/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/small-business-administration/all-domains-30-days.csv'
    'https://analytics.usa.gov/data/social-security-administration/all-domains-30-days.csv'

In [35]:
import os
import requests
from datetime import datetime

In [36]:
url_base = 'https://analytics.usa.gov/data/'

# This is the department dictionary with each department's name and its 3-letter abbreviation used toward the filename
dept_dict = {
    'live' : 'apw',
    'agency-international-development' : 'aid',
    'agriculture' : 'agr',
    'commerce' : 'com',
    'defense' : 'def',
    'education' : 'edu',
    'energy' : 'ene',
    'health-human-services' : 'hhs',
    'homeland-security' : 'hls',
    'housing-urban-development' : 'hud',
    'justice' : 'jus',
    'labor' : 'lab',
    'state' : 'sta',
    'transportation' : 'tra',
    'veterans-affairs' : 'vet',
    'interior' : 'int',
    'treasury' : 'tre',
    'environmental-protection-agency' : 'epa',
    'executive-office-president' : 'eop',
    'general-services-administration' : 'gsa',
    'national-aeronautics-space-administration' : 'nas',
    'national-archives-records-administration' : 'nar',
    'national-science-foundation' : 'nsf',
    'nuclear-regulatory-commission' : 'nrc',
    'office-personnel-management' : 'opm',
    'postal-service' : 'pos',
    'small-business-administration' : 'sba',
    'social-security-administration' : 'ssa'
    }

file_list = [
    'all-domains-30-days.csv',
    'top-downloads-yesterday.csv',
    'top-traffic-sources-30-days.csv',
    'top-exit-pages-30-days.csv',
    'all-pages-realtime.csv',
    'realtime.json',
    'language.csv',
    'top-countries-realtime.json',
    'top-cities-realtime.json',
    'devices.csv',
    'device_model.csv',
    'browsers.csv',
    'os.csv',
    'os-browsers.csv',
    'windows-browsers.csv',
    'windows-ie.csv',
    'screen-size.csv'
    ]

name_change = lambda x: x if x != 'live' else 'all-participating-websites'
url_dictionary = {(name_change(dept),abbr+'-'+fname) : url_base+dept+'/'+fname for dept,abbr in dept_dict.items() for fname in file_list}


In [37]:
print(url_dictionary)

{('all-participating-websites', 'apw-all-domains-30-days.csv'): 'https://analytics.usa.gov/data/live/all-domains-30-days.csv', ('all-participating-websites', 'apw-top-downloads-yesterday.csv'): 'https://analytics.usa.gov/data/live/top-downloads-yesterday.csv', ('all-participating-websites', 'apw-top-traffic-sources-30-days.csv'): 'https://analytics.usa.gov/data/live/top-traffic-sources-30-days.csv', ('all-participating-websites', 'apw-top-exit-pages-30-days.csv'): 'https://analytics.usa.gov/data/live/top-exit-pages-30-days.csv', ('all-participating-websites', 'apw-all-pages-realtime.csv'): 'https://analytics.usa.gov/data/live/all-pages-realtime.csv', ('all-participating-websites', 'apw-realtime.json'): 'https://analytics.usa.gov/data/live/realtime.json', ('all-participating-websites', 'apw-language.csv'): 'https://analytics.usa.gov/data/live/language.csv', ('all-participating-websites', 'apw-top-countries-realtime.json'): 'https://analytics.usa.gov/data/live/top-countries-realtime.json

In [38]:
# datetime object containing current date and time
def create_dir_w_timestamp(dept_dict):
    """
    This function create a new directory with current time stamp and return a string of directory name
    This function also create all sub-directories
    """
    now = datetime.now()
    print("Now :", now)

    # dd/mm/YY H:M:S
    dt_string = now.strftime("%Y%m%d-%H%M")
    print("Date and Time :", dt_string)
    dir_path = 'dataset-'+dt_string
    os.mkdir(dir_path)
    print("Creating a new directory:", dir_path)
    sub_folders = {folder for (folder,fname),url in dept_dict.items()}
    for f in sub_folders:
        sub_path = dir_path+'/'+f
        os.mkdir(sub_path)
        print("Creating a new directory:", sub_path)
    return dir_path

In [39]:
def is_downloadable(url):
    """
    Does the url contain a downloadable resource
    """
    h = requests.head(url, allow_redirects=True)
    header = h.headers
    content_type = header.get('content-type')
    if 'csv' in content_type.lower():
        return True
    if 'json' in content_type.lower():
        return True
    if 'text' in content_type.lower():
        return False
    if 'html' in content_type.lower():
        return False
    return True

In [42]:
def download_dataset(url_dict):
    """
    This function will download all dataset from the URL dictionary into a newly created timestamp directory
    """
    # first create a timestamp directory
    dir_path = create_dir_w_timestamp(url_dict)
    
    # walk through URL dictory, check URL status before downloading each file
    for (folder,filename),url in url_dict.items():
        if is_downloadable(url):
            r = requests.get(url, allow_redirects=True)
            file_path = dir_path+'/'+folder+'/'+filename
            open(file_path, 'wb').write(r.content)
            print(f"Downloading - {url} as {file_path}")
        else:
            print(f"Unable to download - {url}")

'\n    for (folder,filename),url in url_dict.items():\n        if is_downloadable(url):\n            r = requests.get(url, allow_redirects=True)\n            file_path = dir_path+\'/\'+folder+\'/\'+filename\n            open(file_path, \'wb\').write(r.content)\n            print(f"Downloading - {url} as {file_path}")\n        else:\n            print(f"Unable to download - {url}")\n'

In [43]:
download_dataset(url_dictionary)

Now : 2022-07-15 11:10:56.245517
Date and Time : 20220715-1110
Creating a new directory: dataset-20220715-1110
Creating a new directory: dataset-20220715-1110/state
Creating a new directory: dataset-20220715-1110/agriculture
Creating a new directory: dataset-20220715-1110/national-archives-records-administration
Creating a new directory: dataset-20220715-1110/office-personnel-management
Creating a new directory: dataset-20220715-1110/agency-international-development
Creating a new directory: dataset-20220715-1110/justice
Creating a new directory: dataset-20220715-1110/transportation
Creating a new directory: dataset-20220715-1110/education
Creating a new directory: dataset-20220715-1110/housing-urban-development
Creating a new directory: dataset-20220715-1110/general-services-administration
Creating a new directory: dataset-20220715-1110/environmental-protection-agency
Creating a new directory: dataset-20220715-1110/postal-service
Creating a new directory: dataset-20220715-1110/veteran