In [2]:
import requests
from bs4 import BeautifulSoup
import re
import os
import gzip
from urllib.parse import urljoin
import shutil

In [8]:
# Directory listing that is scraped for .csv.gz links.
# The trailing slash matters: urljoin() resolves relative hrefs against this
# URL, and without the slash the last path segment would be replaced.
url = "https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/"

# Folder where files will be saved (relative to the current working directory).
# Path components are passed separately so the separator is correct on every OS.
data_dir = os.path.join(os.getcwd(), "data", "ncei_noaa")

def can_download(link):
    """Decide whether a link from the listing page should be downloaded.

    Args:
        link: The href text of an anchor on the listing page.

    Returns:
        True when the link ends with 'csv.gz' and does not contain the
        substring 'fatalities'; False otherwise.
    """
    is_gzipped_csv = link.endswith('csv.gz')
    mentions_fatalities = 'fatalities' in link
    return is_gzipped_csv and not mentions_fatalities

# Seconds to wait on any single network operation before giving up, so a
# stalled connection cannot hang the cell indefinitely.
request_timeout = 60

# Get the page content
response = requests.get(url, timeout=request_timeout)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect absolute URLs for every link accepted by can_download()
    csv_links = [urljoin(url, a["href"]) for a in soup.find_all("a", href=True) if can_download(a['href'])]

    print(f"Found {len(csv_links)} CSV files. Downloading...")

    # open() does not create missing directories, so make sure the target exists
    os.makedirs(data_dir, exist_ok=True)

    # Download each CSV file
    for link in csv_links:
        gz_filename = os.path.join(data_dir, link.split("/")[-1])
        csv_filename = gz_filename[:-3]  # Remove .gz to get .csv filename
        # Extraction goes to a temporary name first, so csv_filename only ever
        # exists once it is complete and the skip check below stays trustworthy.
        partial_filename = csv_filename + ".part"

        # Skip if CSV file already exists (prevents unnecessary re-download)
        if os.path.exists(csv_filename):
            print(f"Skipping {csv_filename} (already extracted)")
            continue

        try:
            print(f"Downloading {gz_filename} ...")
            # Stream to disk instead of buffering the whole archive in memory
            with requests.get(link, stream=True, timeout=request_timeout) as file_response:
                # Fail loudly on 4xx/5xx rather than saving an error page as .gz
                file_response.raise_for_status()
                with open(gz_filename, "wb") as file:
                    for chunk in file_response.iter_content(chunk_size=1024 * 1024):  # 1 MiB
                        file.write(chunk)

            print(f"Extracting {gz_filename} ...")

            # Unzip the .gz file
            with gzip.open(gz_filename, "rb") as f_in:
                with open(partial_filename, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)  # takes data from .gz file and puts it into .csv

            # Atomically publish the fully extracted file under its final name
            os.replace(partial_filename, csv_filename)

            print(f"Extracted to {csv_filename}")
        finally:
            # Remove the downloaded .gz, plus any half-written output if a step
            # above raised; the exception itself still propagates.
            for leftover in (gz_filename, partial_filename):
                if os.path.exists(leftover):
                    os.remove(leftover)

    print(f'Download complete. Files are saved in: {data_dir}')

else:
    print(f"Failed to fetch page. Status Code: {response.status_code}")

Found 128 CSV files. Downloading...
Downloading /home/clw009/NaturalDisasterProject/data/ncei_noaa/StormEvents_details-ftp_v1.0_d1950_c20210803.csv.gz ...
Extracting /home/clw009/NaturalDisasterProject/data/ncei_noaa/StormEvents_details-ftp_v1.0_d1950_c20210803.csv.gz ...
Extracted to /home/clw009/NaturalDisasterProject/data/ncei_noaa/StormEvents_details-ftp_v1.0_d1950_c20210803.csv
Downloading /home/clw009/NaturalDisasterProject/data/ncei_noaa/StormEvents_details-ftp_v1.0_d1951_c20210803.csv.gz ...
Extracting /home/clw009/NaturalDisasterProject/data/ncei_noaa/StormEvents_details-ftp_v1.0_d1951_c20210803.csv.gz ...
Extracted to /home/clw009/NaturalDisasterProject/data/ncei_noaa/StormEvents_details-ftp_v1.0_d1951_c20210803.csv
Downloading /home/clw009/NaturalDisasterProject/data/ncei_noaa/StormEvents_details-ftp_v1.0_d1952_c20210803.csv.gz ...
Extracting /home/clw009/NaturalDisasterProject/data/ncei_noaa/StormEvents_details-ftp_v1.0_d1952_c20210803.csv.gz ...
Extracted to /home/clw009/N