# Downloading the ClimateNet dataset

In [18]:
import requests, os, bs4
from bs4 import BeautifulSoup

### List all links to NetCDF files at a given url

def list_nc_datasets(index_url):
    """Return the href of every link to a NetCDF (``.nc``) file on an index page.

    Args:
        index_url: URL of the HTML page to scan for links.

    Returns:
        A list of href strings, exactly as they are written in the page (they
        are NOT joined with ``index_url``), one per ``<a>`` tag whose href ends
        with ``.nc``.

    Raises:
        requests.HTTPError: if the index page answers with a 4xx/5xx status.
    """
    # Fetch the index page; fail loudly rather than parse an HTTP error page
    reqs = requests.get(index_url)
    reqs.raise_for_status()
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # Target of every link tag in the page (None for anchors without an href)
    hrefs = (link.get('href') for link in soup.find_all('a'))

    # Keep only links to NetCDF files; the truthiness check skips the None
    # entries, which would otherwise crash on `.endswith`
    return [href for href in hrefs if href and href.endswith('.nc')]

def list_climatenet_urls(index_url="https://portal.nersc.gov/project/ClimateNet/climatenet_new/test/"):
    """Return the full URL of every NetCDF (``.nc``) file linked from an index page.

    Args:
        index_url: URL of the directory listing to scan. It is treated as a
            directory whether or not it ends with a slash.

    Returns:
        A list of URLs, one per ``<a>`` tag whose href ends with ``.nc``.
        Each href is resolved against ``index_url``.

    Raises:
        requests.HTTPError: if the index page answers with a 4xx/5xx status.
    """
    from urllib.parse import urljoin

    # Fetch the index page; fail loudly rather than parse an HTTP error page
    reqs = requests.get(index_url)
    reqs.raise_for_status()
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # Normalise to exactly one trailing slash so the joined URLs never contain
    # a doubled "//" and urljoin does not drop the last path segment
    base_url = index_url.rstrip('/') + '/'

    # Target of every link tag in the page (None for anchors without an href)
    hrefs = (link.get('href') for link in soup.find_all('a'))

    # Keep only links to NetCDF files, skipping anchors that have no href
    return [urljoin(base_url, href) for href in hrefs if href and href.endswith('.nc')]

### Download a file to Google Drive

def download_file_gdrive(index_url, file_url, dest_dir, base_dir='/content/gdrive/My Drive/CS230/Data/'):
    """Download one file and save it in ``dest_dir`` under ``base_dir``.

    The file is fetched from ``index_url + file_url`` and written chunk by
    chunk, so it is never held in memory as a whole. A one-line summary with
    the number of bytes written is printed once the download is finished.

    Args:
        index_url: URL prefix that ``file_url`` is appended to.
        file_url: Path of the file relative to ``index_url``; its basename is
            used as the local file name.
        dest_dir: Sub-directory of ``base_dir`` to save into (created if needed).
        base_dir: Root directory for downloads. Defaults to the Google Drive
            data folder this function originally had hard-coded.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status; no
            local file is created in that case.
    """
    # Create folder
    target_dir = os.path.join(base_dir, dest_dir.lstrip('/'))
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, os.path.basename(file_url))

    # Stream GET request; the context manager releases the connection afterwards
    with requests.get(index_url + file_url, stream=True) as r:
        # Check the status before opening the file so an error body is never
        # saved as if it were data
        r.raise_for_status()

        # Save the file to the folder, one chunk at a time
        size = 0
        with open(target_path, "wb") as file:
            for block in r.iter_content(chunk_size=1024 * 1024):
                if block:
                    file.write(block)
                    size += len(block)

    # Display file size
    print("Download complete: "+file_url+" – Size: "+str(size)+" bytes.")

    return

### Download all NetCDF files from a target url ###

def download_climate_net(index_url, dest_dir):
    """Download every NetCDF file linked from ``index_url`` into ``dest_dir``.

    The links are discovered with ``list_nc_datasets`` and then fetched one at
    a time with ``download_file_gdrive``. A ``position/total`` counter is
    printed (without a newline) before each download starts.
    """
    file_urls = list_nc_datasets(index_url)
    total = len(file_urls)

    for position, file_url in enumerate(file_urls, start=1):
        # Progress prefix; the download helper finishes the line
        print(f"{position}/{total}", end=" ")
        download_file_gdrive(index_url, file_url, dest_dir)

def download_urls(url_list, output_dir="/data_downloads"):
    """Download each URL in ``url_list`` into ``output_dir``.

    Every file is saved under the last path segment of its URL. A URL that
    does not answer with HTTP 200 is reported and skipped; the remaining URLs
    are still processed.

    Args:
        url_list: Iterable of URLs to download.
        output_dir: Directory to save into; created (with parents) if missing.
    """
    # Without this, open() below raises FileNotFoundError for any directory
    # that has not been created by hand beforehand
    os.makedirs(output_dir, exist_ok=True)

    for url in url_list:
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to download: {url}")
            continue

        # Extract the filename from the URL
        filename = url.split("/")[-1]
        output_path = os.path.join(output_dir, filename)

        # Save the response content to a file
        with open(output_path, "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {url}")

def download_climatenet_local(index_url="https://portal.nersc.gov/project/ClimateNet/climatenet_new/test/", output_dir="/data_downloads"):
    """Download every NetCDF file listed at ``index_url`` into ``output_dir``.

    Combines ``list_climatenet_urls`` (to find the file URLs) with
    ``download_urls`` (to fetch them).
    """
    download_urls(list_climatenet_urls(index_url), output_dir)


### Downloading test dataset



In [14]:
# Download the test split (this cell previously pointed at train/, contradicting
# its heading) into the directory the S3 upload step reads the test files from.
download_climatenet_local("https://portal.nersc.gov/project/ClimateNet/climatenet_new/test/", "/data_downloads/climatenet/test")

Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-06-01-01-1_0.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-06-03-01-1_0.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-06-03-01-1_1.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-06-16-01-1_0.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-07-26-01-1_0.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-07-26-01-1_1.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-07-29-01-1_0.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-08-08-01-1_0.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/test//data-2011-08-08-01-1_1.nc
Downloaded: https://portal.nersc.gov/project/ClimateNet/climatenet_new/te

In [None]:
# Download the train split into its own local directory
download_climatenet_local(
    index_url="https://portal.nersc.gov/project/ClimateNet/climatenet_new/train/",
    output_dir="/data_downloads/climatenet/train",
)

# Upload to S3

In [21]:

import os
import boto3
import sagemaker
# Key prefix placed in front of every object uploaded by this notebook
prefix = 'data/climatenet'

# Use the SageMaker session's default S3 bucket as the upload target
sess = sagemaker.Session()
bucket = sess.default_bucket()
print(bucket)

sagemaker-us-west-2-014690315417


In [22]:
# Local directories that hold the downloaded train and test splits
train_data_path, test_data_path = [
    "/data_downloads/climatenet/" + split for split in ("train", "test")
]


def upload(directory, data_prefix):
    """Upload every file found under ``directory`` to S3.

    Each file is stored in the module-level ``bucket`` under the key
    ``<prefix>/<data_prefix>/<path of the file relative to directory>``,
    where ``prefix`` is also a module-level variable. One progress line is
    printed per uploaded file.

    Args:
        directory: Local directory to upload; sub-directories are included.
        data_prefix: Key component inserted after ``prefix`` (e.g. ``"train"``).
    """
    import posixpath

    # Resolve every file up front so the progress counter reports a real total
    # instead of restarting for each sub-directory
    pending = []
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # fixed traversal order -> reproducible upload order
        for file in sorted(files):
            # Join with `root`, not `directory`: os.walk also yields
            # sub-directories, whose files do not live directly in `directory`
            local_path = os.path.join(root, file)
            relative_key = os.path.relpath(local_path, directory).replace(os.sep, "/")
            # S3 keys always use "/" regardless of the local path separator
            key = posixpath.join(prefix, data_prefix, relative_key)
            pending.append((file, local_path, key))

    if not pending:
        return

    # One session / bucket handle for the whole batch rather than one per file
    s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

    for file_num, (file, local_path, key) in enumerate(pending, start=1):
        s3_bucket.Object(key).upload_file(local_path)
        print(f"{file} file  {file_num} of {len(pending)} uploaded")


In [23]:
# Push the local test split to S3 under the "test" key component
upload(directory=test_data_path, data_prefix="test")

data-2011-06-01-01-1_0.nc file  1 of 61 uploaded
data-2011-06-03-01-1_0.nc file  2 of 61 uploaded
data-2011-06-03-01-1_1.nc file  3 of 61 uploaded
data-2011-06-16-01-1_0.nc file  4 of 61 uploaded
data-2011-07-26-01-1_0.nc file  5 of 61 uploaded
data-2011-07-26-01-1_1.nc file  6 of 61 uploaded
data-2011-07-29-01-1_0.nc file  7 of 61 uploaded
data-2011-08-08-01-1_0.nc file  8 of 61 uploaded
data-2011-08-08-01-1_1.nc file  9 of 61 uploaded
data-2011-09-06-01-1_0.nc file  10 of 61 uploaded
data-2011-09-06-01-1_1.nc file  11 of 61 uploaded
data-2011-09-09-01-1_0.nc file  12 of 61 uploaded
data-2011-09-09-01-1_1.nc file  13 of 61 uploaded
data-2011-09-09-01-1_2.nc file  14 of 61 uploaded
data-2011-09-09-01-1_3.nc file  15 of 61 uploaded
data-2011-09-12-01-1_0.nc file  16 of 61 uploaded
data-2011-09-30-01-1_0.nc file  17 of 61 uploaded
data-2011-09-30-01-1_1.nc file  18 of 61 uploaded
data-2011-09-30-01-1_2.nc file  19 of 61 uploaded
data-2011-09-30-01-1_3.nc file  20 of 61 uploaded
data-2011

In [25]:
# Push the local train split to S3 under the "train" key component
upload(directory=train_data_path, data_prefix="train")

data-1996-06-09-01-1_0.nc file  1 of 398 uploaded
data-1996-07-11-01-1_0.nc file  2 of 398 uploaded
data-1996-07-18-01-1_0.nc file  3 of 398 uploaded
data-1996-07-18-01-1_1.nc file  4 of 398 uploaded
data-1996-07-18-01-1_2.nc file  5 of 398 uploaded
data-1996-09-01-01-1_0.nc file  6 of 398 uploaded
data-1996-09-01-01-1_1.nc file  7 of 398 uploaded
data-1996-09-12-01-1_0.nc file  8 of 398 uploaded
data-1996-09-26-01-1_0.nc file  9 of 398 uploaded
data-1996-09-26-01-1_1.nc file  10 of 398 uploaded
data-1996-10-03-01-1_0.nc file  11 of 398 uploaded
data-1996-10-03-01-1_1.nc file  12 of 398 uploaded
data-1997-06-01-01-1_0.nc file  13 of 398 uploaded
data-1997-06-02-01-1_0.nc file  14 of 398 uploaded
data-1997-06-05-01-1_0.nc file  15 of 398 uploaded
data-1997-06-18-01-1_0.nc file  16 of 398 uploaded
data-1997-07-09-01-1_0.nc file  17 of 398 uploaded
data-1997-07-09-01-1_1.nc file  18 of 398 uploaded
data-1997-08-14-01-1_0.nc file  19 of 398 uploaded
data-1997-08-14-01-1_1.nc file  20 of 39