# Create `download_links.csv` file
This notebook scrapes the USGS 3DEP Hurricane Florence archive to create a single CSV file with all the download links.

**CONDA ENVIRONMENT**: `arcgispro-py3`

In [None]:
# Import packages
from pathlib import Path
from urllib import request
import pandas as pd

In [None]:
# Paths: scratch folder, one level up from the working directory (created if needed)
scratch_folder = Path.cwd().parent/'data'/'scratch'
# exist_ok expects a boolean. The previous string 'True' only worked because
# any non-empty string is truthy ('False' would have behaved identically).
scratch_folder.mkdir(parents=True, exist_ok=True)

# Paths: download_links.csv file
download_csv = Path.cwd().parent/'data'/'download_links.csv'
# Informational only: nothing is skipped here when the file already exists
if download_csv.exists():
    print(f'{download_csv} exists!')

In [None]:
# Set the url to the main page holding the links
url = 'https://rockyweb.usgs.gov/vdelivery/Datasets/Staged/Elevation/LPC/Projects/NC_HurricaneFlorence_2020_D20/'

# Fetch the page & pull the content. The context manager closes the HTTP
# response even if reading or decoding raises.
with request.urlopen(url) as response:
    webContent = response.read().decode('utf-8')

# Find the lines with the links: keep only lines holding an anchor whose
# href starts with "NC_Hur"
link_lines = [line for line in webContent.split('\n') if 'a href="NC_Hur' in line]

# Extract the links into a list: take the href value (text between the first
# pair of quotes after 'a href=') and append it to the base url
links = [url + line.split('a href="')[1].split('"')[0] for line in link_lines]
print(f'{len(links)} links extracted')

In [None]:
# Fetch the list of links in each folder
print("fetching download links")
verbose = False
for link in links:
    # Folder name = last path component of the link (links end with '/')
    folder_name = link.split('/')[-2]
    # Set the output filename
    output_filename = scratch_folder/f'{folder_name}.txt'

    # Skip folders whose link list is already in the scratch folder
    if output_filename.exists():
        if verbose:  print(f'{folder_name} already downloaded to {output_filename}.')
        continue

    print(f'Downloading {folder_name}...')
    # Use a dedicated name so the base `url` set earlier is not overwritten
    links_file_url = f'{link}0_file_download_links.txt'
    # Download to a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated .txt that later runs would treat as
    # "already downloaded"
    partial_filename = output_filename.with_name(output_filename.name + '.part')
    try:
        request.urlretrieve(links_file_url, partial_filename)
        partial_filename.replace(output_filename)
    finally:
        # Only present here if the download or rename failed
        if partial_filename.exists():
            partial_filename.unlink()

In [None]:
# Collect one (filename, link, tile_id) record per download link and build the
# dataframe once at the end; appending with df.loc[i] re-allocates on every
# row and becomes quadratic for long link lists.
records = []

# Get a list of files (sorted so the CSV row order is reproducible)
files = sorted(scratch_folder.rglob('*.txt'))

# Loop through the files
for file in files:
    print(f'Reading {file}...')
    # Read the file, one link per line
    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline); they would otherwise
            # produce a row with an empty link and an empty tile_id
            if not line:
                continue
            # tile_id = last '_'-separated token of the link's file name,
            # minus its final four characters (presumably the extension,
            # e.g. '.laz' -- TODO confirm against the link lists)
            tile_id = line.split('/')[-1].split('_')[-1][:-4]
            records.append((file.stem, line, tile_id))

# Create a dataframe to hold the filename and links
df = pd.DataFrame(records, columns=['filename', 'link', 'tile_id'])

# Export to a file
df.to_csv(download_csv, index=False)
print(f'File saved to {download_csv}')

In [23]:
# Delete the scratch folder: remove the .txt files directly inside it first,
# because rmdir() only succeeds on an empty directory
for txt_file in scratch_folder.glob('*.txt'):
    txt_file.unlink()
scratch_folder.rmdir()