In [1]:
import requests

url = "https://www.ncei.noaa.gov/oa/local-climatological-data/v2/doc/lcdv2-station-list.txt"

response = requests.get(url)
response.raise_for_status()  # Raises an error if the request failed

lines = response.text.strip().split("\n")
print(f"Number of rows: {len(lines)}")


Number of rows: 24072


In [None]:
import pandas as pd
# Parse lines using fixed-width positions
data = []
for line in lines[:]:
    station_id = line[0:11].strip()
    latitude = float(line[11:21].strip())
    longitude = float(line[21:31].strip())
    elevation = float(line[31:41].strip())
    station_name = line[41:].strip()
    data.append([station_id, latitude, longitude, elevation, station_name])

# Create DataFrame
df = pd.DataFrame(data, columns=["Station ID", "Latitude", "Longitude", "Elevation", "Station Name"])
print(df.shape)

In [32]:
df.loc[df['Station ID'] == 'ASN00066062']

Unnamed: 0,Station ID,Latitude,Longitude,Elevation,Station Name
1197,ASN00066062,-33.8607,151.205,39.0,SYDNEY (OBSERVATORY HILL)


In [39]:
import requests
import xml.etree.ElementTree as ET

def sum_tar_gz_sizes(xml_url):
    response = requests.get(xml_url)
    if response.status_code != 200:
        print(f"Failed to get XML from {xml_url}: {response.status_code}")
        return
    
    # Parse XML
    root = ET.fromstring(response.content)
    
    # S3 XML namespace
    ns = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'}
    
    total_size = 0
    for content in root.findall('s3:Contents', ns):
        key = content.find('s3:Key', ns).text
        size = int(content.find('s3:Size', ns).text)
        if key.endswith('.tar.gz'):
            print(f"Found: {key} Size: {size}")
            total_size += size
    
    total_gb = total_size / (1024 ** 3)
    print(f"Total size of .tar.gz files: {total_gb:.2f} GB")




Total size of .tar.gz files: 0.00 GB


In [46]:
# Replace with the actual NOAA S3 XML list URL for LCD bucket (example)
xml_listing_url = "https://www.ncei.noaa.gov/oa/local-climatological-data/"

response = requests.get(xml_listing_url)

response.content

b'<?xml version="1.0" encoding="UTF-8"?><ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Tenant>prod</Tenant><Name>datapub-local-climatological-data</Name><Prefix></Prefix><MaxKeys>1000</MaxKeys><IsTruncated>true</IsTruncated><Contents><Key>index.html</Key><LastModified>2023-10-30T18:25:15.188Z</LastModified><ETag>&quot;7a5d43b2e147d6f68e098041c7202fbc&quot;</ETag><Size>33023</Size><StorageClass>STANDARD</StorageClass><Owner><ID>prod$cdoload-prod</ID><DisplayName>CDOLOAD PROD</DisplayName></Owner><Type>Normal</Type></Contents><Contents><Key>v2/access/1790/LCD_ASN00066062_1790.csv</Key><LastModified>2025-05-21T16:31:49.248Z</LastModified><ETag>&quot;8455c46b339f6f8ad90ca5a75cfaeb0f&quot;</ETag><Size>165642</Size><StorageClass>STANDARD</StorageClass><Owner><ID>prod$cdoload-prod</ID><DisplayName>CDOLOAD PROD</DisplayName></Owner><Type>Normal</Type></Contents><Contents><Key>v2/access/1790/LCD_SZI0000LSZG_1790.csv</Key><LastModified>2025-05-21T16:31:49.240Z</LastModified><

In [49]:
import requests
import xml.etree.ElementTree as ET

xml_listing_url = "https://www.ncei.noaa.gov/oa/local-climatological-data/"

response = requests.get(xml_listing_url)
xml_bytes = response.content

# Strip namespaces by re-parsing without namespaces — 
# a helper to remove namespaces from the XML string
def strip_namespace(xml_bytes):
    xml_str = xml_bytes.decode('utf-8')
    import re
    # Remove xmlns declaration and namespace prefixes (like s3:)
    xml_str = re.sub(r'\sxmlns(:\w+)?="[^"]+"', '', xml_str, count=1)
    xml_str = re.sub(r'(<\/?)(\w+:)', r'\1', xml_str)
    return xml_str

xml_clean = strip_namespace(xml_bytes)

root = ET.fromstring(xml_clean)

total_size = 0
for content in root.findall('Contents'):
    key = content.find('Key').text
    size = int(content.find('Size').text)
    if key.endswith('.csv'):
        print(f"{key}: {size} bytes")
        total_size += size

print(f"\nTotal size of all .csv files: {total_size / (1024**3):.3f} GB")


v2/access/1790/LCD_ASN00066062_1790.csv: 165642 bytes
v2/access/1790/LCD_SZI0000LSZG_1790.csv: 492560 bytes
v2/access/1791/LCD_ASN00066062_1791.csv: 142720 bytes
v2/access/1791/LCD_SZI0000LSZG_1791.csv: 497373 bytes
v2/access/1792/LCD_SZI0000LSZG_1792.csv: 498306 bytes
v2/access/1793/LCD_SZI0000LSZG_1793.csv: 459440 bytes
v2/access/1794/LCD_SZI0000LSZG_1794.csv: 497435 bytes
v2/access/1795/LCD_SZI0000LSZG_1795.csv: 496847 bytes
v2/access/1796/LCD_SZI0000LSGG_1796.csv: 498444 bytes
v2/access/1796/LCD_SZI0000LSZG_1796.csv: 447272 bytes
v2/access/1796/LCD_UKA00039131_1796.csv: 469926 bytes
v2/access/1797/LCD_SZI0000LSGG_1797.csv: 497079 bytes
v2/access/1797/LCD_SZI0000LSZG_1797.csv: 10566 bytes
v2/access/1797/LCD_UKA00039131_1797.csv: 482588 bytes
v2/access/1798/LCD_SZI0000LSGG_1798.csv: 455148 bytes
v2/access/1798/LCD_UKA00039131_1798.csv: 481727 bytes
v2/access/1799/LCD_SZI0000LSGG_1799.csv: 496647 bytes
v2/access/1799/LCD_UKA00039131_1799.csv: 483374 bytes
v2/access/1800/LCD_SZI0000LSG

In [42]:
root

<Element '{http://s3.amazonaws.com/doc/2006-03-01/}ListBucketResult' at 0x1380a9e90>

In [74]:
list = """1901.tar.gz	2023-12-22 06:55	83K	 
1902.tar.gz	2023-12-22 06:55	83K	 
1903.tar.gz	2023-12-22 06:55	82K	 
1904.tar.gz	2023-12-22 06:55	81K	 
1905.tar.gz	2023-12-22 06:55	79K	 
1906.tar.gz	2023-12-22 06:55	67K	 
1907.tar.gz	2023-12-22 07:23	66K	 
1908.tar.gz	2023-12-22 07:23	82K	 
1909.tar.gz	2023-12-22 06:55	93K	 
1910.tar.gz	2023-12-23 12:45	94K	 
1911.tar.gz	2023-12-23 12:45	95K	 
1912.tar.gz	2023-12-23 12:45	94K	 
1913.tar.gz	2023-12-23 12:45	106K	 
1914.tar.gz	2023-12-23 12:45	107K	 
1915.tar.gz	2023-12-23 12:45	107K	 
1916.tar.gz	2023-12-23 12:45	26K	 
1917.tar.gz	2023-12-23 12:45	109K	 
1918.tar.gz	2023-12-23 12:45	103K	 
1919.tar.gz	2023-12-23 12:45	95K	 
1920.tar.gz	2023-12-24 21:27	106K	 
1921.tar.gz	2023-12-24 21:27	106K	 
1922.tar.gz	2023-12-24 21:27	105K	 
1923.tar.gz	2023-12-24 21:27	93K	 
1924.tar.gz	2023-12-24 21:27	92K	 
1925.tar.gz	2023-12-24 21:27	94K	 
1926.tar.gz	2023-12-24 21:27	124K	 
1927.tar.gz	2023-12-24 21:27	43K	 
1928.tar.gz	2023-12-24 21:27	93K	 
1929.tar.gz	2023-12-24 21:27	474K	 
1930.tar.gz	2023-12-25 06:24	1.2M	 
1931.tar.gz	2023-12-25 06:23	3.2M	 
1932.tar.gz	2023-12-25 06:24	5.3M	 
1933.tar.gz	2023-12-25 06:24	5.5M	 
1934.tar.gz	2023-12-25 06:25	6.4M	 
1935.tar.gz	2023-12-25 06:26	7.8M	 
1936.tar.gz	2023-12-25 06:26	9.4M	 
1937.tar.gz	2023-12-25 06:27	12M	 
1938.tar.gz	2023-12-25 06:28	9.9M	 
1939.tar.gz	2023-12-25 06:29	10M	 
1940.tar.gz	2023-12-26 13:06	12M	 
1941.tar.gz	2023-12-26 13:07	16M	 
1942.tar.gz	2023-12-26 13:03	30M	 
1943.tar.gz	2023-12-26 13:08	61M	 
1944.tar.gz	2023-12-26 13:04	80M	 
1945.tar.gz	2023-12-26 13:10	84M	 
1946.tar.gz	2023-12-26 13:09	48M	 
1947.tar.gz	2023-12-26 13:12	50M	 
1948.tar.gz	2023-12-26 13:14	103M	 
1949.tar.gz	2023-12-26 13:17	136M	 
1950.tar.gz	2023-12-27 20:40	149M	 
1951.tar.gz	2023-12-27 20:43	156M	 
1952.tar.gz	2023-12-27 20:46	165M	 
1953.tar.gz	2023-12-27 20:49	176M	 
1954.tar.gz	2023-12-27 20:52	181M	 
1955.tar.gz	2023-12-27 20:55	167M	 
1956.tar.gz	2023-12-27 20:59	170M	 
1957.tar.gz	2023-12-27 21:03	195M	 
1958.tar.gz	2023-12-27 21:07	196M	 
1959.tar.gz	2023-12-27 21:12	197M	 
1960.tar.gz	2023-12-27 21:43	202M	 
1961.tar.gz	2023-12-27 21:49	207M	 
1962.tar.gz	2023-12-27 21:56	209M	 
1963.tar.gz	2023-12-27 22:02	207M	 
1964.tar.gz	2023-12-28 23:40	182M	 
1965.tar.gz	2023-12-28 23:44	122M	 
1966.tar.gz	2023-12-28 23:47	125M	 
1967.tar.gz	2023-12-28 23:49	121M	 
1968.tar.gz	2023-12-28 23:52	115M	 
1969.tar.gz	2023-12-28 23:54	151M	 
1970.tar.gz	2023-12-28 23:58	150M	 
1971.tar.gz	2023-12-29 00:02	94M	 
1972.tar.gz	2023-12-29 00:05	51M	 
1973.tar.gz	2023-12-28 23:30	491M	 
1974.tar.gz	2023-12-29 00:06	523M	 
1975.tar.gz	2023-12-29 00:16	545M	 
1976.tar.gz	2023-12-29 00:37	556M	 
1977.tar.gz	2023-12-30 17:44	583M	 
1978.tar.gz	2023-12-30 17:54	603M	 
1979.tar.gz	2024-01-05 08:21	618M	 
1980.tar.gz	2023-12-29 00:26	616M	 
1981.tar.gz	2024-01-05 08:35	633M	 
1982.tar.gz	2023-12-30 18:51	626M	 
1983.tar.gz	2023-12-30 18:04	651M	 
1984.tar.gz	2023-12-30 19:02	665M	 
1985.tar.gz	2023-12-30 19:14	674M	 
1986.tar.gz	2023-12-30 18:15	697M	 
1987.tar.gz	2023-12-30 18:38	714M	 
1988.tar.gz	2023-12-30 18:25	740M	 
1989.tar.gz	2023-12-30 19:26	748M	 
1990.tar.gz	2024-01-01 03:15	791M	 
1991.tar.gz	2024-01-01 03:28	798M	 
1992.tar.gz	2024-01-01 03:39	793M	 
1993.tar.gz	2024-01-21 02:24	788M	 
1994.tar.gz	2024-01-21 02:37	778M	 
1995.tar.gz	2024-01-01 03:02	781M	 
1996.tar.gz	2024-01-06 04:28	816M	 
1997.tar.gz	2024-01-06 04:56	850M	 
1998.tar.gz	2024-01-06 04:43	856M	 
1999.tar.gz	2024-01-06 05:10	1.0G	 
2000.tar.gz	2024-01-09 22:33	1.5G	 
2001.tar.gz	2024-01-09 21:36	1.5G	 
2002.tar.gz	2024-01-09 22:05	1.7G	 
2003.tar.gz	2024-01-09 22:20	1.7G	 
2004.tar.gz	2024-01-09 21:50	1.9G	 
2005.tar.gz	2024-01-12 22:35	2.3G	 
2006.tar.gz	2024-01-13 22:32	2.8G	 
2007.tar.gz	2024-01-13 22:50	3.0G	 
2008.tar.gz	2024-01-13 22:12	3.3G	 
2009.tar.gz	2024-01-14 20:12	3.4G	 
2010.tar.gz	2024-01-14 19:51	3.5G	 
2011.tar.gz	2024-01-14 20:31	3.6G	 
2012.tar.gz	2024-01-15 18:07	3.7G	 
2013.tar.gz	2024-01-15 17:21	3.8G	 
2014.tar.gz	2024-01-15 17:44	3.9G	 
2015.tar.gz	2024-01-16 22:48	4.0G	 
2016.tar.gz	2024-01-16 23:10	4.0G	 
2017.tar.gz	2024-01-16 22:24	4.1G	 
2018.tar.gz	2024-01-18 12:16	4.1G	 
2019.tar.gz	2024-01-18 11:30	4.1G	 
2020.tar.gz	2024-01-19 08:14	4.1G	 
2021.tar.gz	2024-01-20 00:29	4.0G	 
2022.tar.gz	2024-01-20 00:56	4.1G	 
2023.tar.gz	2024-01-11 20:26	4.1G	 
2024.tar.gz	2025-01-24 14:29	4.1G	 
2025.tar.gz	2025-05-25 17:18	1.6G"""

list = list.split("\n")
list = [l.split('\t')[2] for l in list]

total = 0

for size in list:
	if size.endswith('K'):
		total += float(size[:-1]) * 1024
	elif size.endswith('M'):
		total += float(size[:-1]) * 1024 * 1024
	elif size.endswith('G'):
		total += float(size[:-1]) * 1024 * 1024 * 1024

print(f"Total size: {total / (1024 ** 3):.2f} GB")

Total size: 106.69 GB


In [79]:
import requests
from bs4 import BeautifulSoup
import os

# URL to LCDv2 archive
base_url = "https://www.ncei.noaa.gov/data/local-climatological-data/archive/"

# Directory to save files
os.makedirs("lcd_data", exist_ok=True)

# Get HTML content
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")

# Find first 10 .tar.gz links
file_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.tar.gz')][:10]

# Download files
for link in file_links:
    filename = link.split('/')[-1]
    file_url = base_url + link
    print(f"Downloading {filename}...")
    r = requests.get(file_url)
    with open(os.path.join("lcd_data", filename), "wb") as f:
        f.write(r.content)

print("Download complete.")

Downloading 1901.tar.gz...
Downloading 1902.tar.gz...
Downloading 1903.tar.gz...
Downloading 1904.tar.gz...
Downloading 1905.tar.gz...
Downloading 1906.tar.gz...
Downloading 1907.tar.gz...
Downloading 1908.tar.gz...
Downloading 1909.tar.gz...
Downloading 1910.tar.gz...
Download complete.


In [78]:
file_links

['1901.tar.gz',
 '1902.tar.gz',
 '1903.tar.gz',
 '1904.tar.gz',
 '1905.tar.gz',
 '1906.tar.gz',
 '1907.tar.gz',
 '1908.tar.gz',
 '1909.tar.gz',
 '1910.tar.gz']

In [81]:
import boto3
import os


BUCKET_NAME = 'de300spring2025'   # Replace with your bucket name
S3_FOLDER = 'MOSES_group/'             # The folder path in S3
LOCAL_DIR = 'lcd_data/'      # Local directory to save files

In [86]:
import os
from dotenv import load_dotenv

def load_aws_credentials(filepath="aws_credentials.env"):
    load_dotenv(dotenv_path=filepath)
    
    aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    aws_session_token = os.getenv("AWS_SESSION_TOKEN")
    
    if not all([aws_access_key_id, aws_secret_access_key, aws_session_token]):
        raise ValueError("Missing one or more AWS credential variables.")
    
    return {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "aws_session_token": aws_session_token
    }

creds = load_aws_credentials()

In [None]:
def upload_folder_to_s3_folder(bucket_name, s3_folder, local_dir):
	"""Upload files from a local directory to an S3 folder."""
	s3_resource = boto3.resource('s3',
								  aws_access_key_id=creds['aws_access_key_id'],
								  aws_secret_access_key=creds['aws_secret_access_key'],
								  aws_session_token=creds['aws_session_token'])
	bucket = s3_resource.Bucket(bucket_name)

	for root_dir, _, files in os.walk(local_dir):
		for file in files:
			local_file_path = os.path.join(root_dir, file)
			relative_path = os.path.relpath(local_file_path, local_dir)
			s3_key = os.path.join(s3_folder, relative_path).replace("\\", "/")  # Ensure S3 key uses forward slashes
			bucket.upload_file(local_file_path, s3_key)
			print(f"Uploaded {local_file_path} to s3://{bucket_name}/{s3_key}")

In [None]:
upload_folder_to_s3_folder(BUCKET_NAME, S3_FOLDER, LOCAL_DIR)

Uploaded lcd_data/1903.tar.gz to s3://de300spring2025/MOSES_group/1903.tar.gz
Uploaded lcd_data/1901.tar.gz to s3://de300spring2025/MOSES_group/1901.tar.gz
Uploaded lcd_data/1905.tar.gz to s3://de300spring2025/MOSES_group/1905.tar.gz
Uploaded lcd_data/1909.tar.gz to s3://de300spring2025/MOSES_group/1909.tar.gz
Uploaded lcd_data/1907.tar.gz to s3://de300spring2025/MOSES_group/1907.tar.gz
Uploaded lcd_data/1902.tar.gz to s3://de300spring2025/MOSES_group/1902.tar.gz
Uploaded lcd_data/1910.tar.gz to s3://de300spring2025/MOSES_group/1910.tar.gz
Uploaded lcd_data/1904.tar.gz to s3://de300spring2025/MOSES_group/1904.tar.gz
Uploaded lcd_data/1908.tar.gz to s3://de300spring2025/MOSES_group/1908.tar.gz
Uploaded lcd_data/1906.tar.gz to s3://de300spring2025/MOSES_group/1906.tar.gz


In [97]:
def upload_file_to_s3_folder(bucket_name, s3_folder, file_path):
	"""Upload a single file to an S3 folder."""
	s3_resource = boto3.resource('s3',
								  aws_access_key_id=creds['aws_access_key_id'],
								  aws_secret_access_key=creds['aws_secret_access_key'],
								  aws_session_token=creds['aws_session_token'])
	bucket = s3_resource.Bucket(bucket_name)

	# Extract the file name and construct the S3 key
	file_name = os.path.basename(file_path)
	s3_key = os.path.join(s3_folder, file_name).replace("\\", "/")  # Ensure S3 key uses forward slashes
	# Upload the file
	bucket.upload_file(file_path, s3_key)
	print(f"Uploaded {file_path} to s3://{bucket_name}/{s3_key}")

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import time

# URL to LCDv2 archive
base_url = "https://www.ncei.noaa.gov/data/local-climatological-data/archive/"

# Directory to save files
os.makedirs("lcd_data", exist_ok=True)

# Get HTML content
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")

# Find first 10 .tar.gz links
file_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.tar.gz')][20:]

# Download files
for link in file_links:
	filename = link.split('/')[-1]
	file_url = base_url + link
	print(f"Downloading {filename}...")
	r = requests.get(file_url)
	with open(os.path.join("lcd_data", filename), "wb") as f:
		f.write(r.content)
	
	local_path = os.path.join("lcd_data", filename)
	upload_file_to_s3_folder(BUCKET_NAME, S3_FOLDER, local_path)

	time.sleep(1)
	os.remove(local_path)

print("Download complete.")

Downloading 1921.tar.gz...
Uploaded lcd_data/1921.tar.gz to s3://de300spring2025/MOSES_group/1921.tar.gz
Downloading 1922.tar.gz...
Uploaded lcd_data/1922.tar.gz to s3://de300spring2025/MOSES_group/1922.tar.gz
Downloading 1923.tar.gz...
Uploaded lcd_data/1923.tar.gz to s3://de300spring2025/MOSES_group/1923.tar.gz
Downloading 1924.tar.gz...
Uploaded lcd_data/1924.tar.gz to s3://de300spring2025/MOSES_group/1924.tar.gz
Downloading 1925.tar.gz...
Uploaded lcd_data/1925.tar.gz to s3://de300spring2025/MOSES_group/1925.tar.gz
Downloading 1926.tar.gz...
Uploaded lcd_data/1926.tar.gz to s3://de300spring2025/MOSES_group/1926.tar.gz
Downloading 1927.tar.gz...
Uploaded lcd_data/1927.tar.gz to s3://de300spring2025/MOSES_group/1927.tar.gz
Downloading 1928.tar.gz...
Uploaded lcd_data/1928.tar.gz to s3://de300spring2025/MOSES_group/1928.tar.gz
Downloading 1929.tar.gz...
Uploaded lcd_data/1929.tar.gz to s3://de300spring2025/MOSES_group/1929.tar.gz
Downloading 1930.tar.gz...
Uploaded lcd_data/1930.tar.g

In [94]:
upload_file_to_s3_folder(BUCKET_NAME, S3_FOLDER, os.path.join("lcd_data", filename))