# Downloading the ClimateNet dataset

In [None]:
import requests, os, bs4
from bs4 import BeautifulSoup

### List all links to NetCDF files at a given url

def list_nc_datasets(index_url):
    """List the NetCDF links found on an HTML index page.

    Fetches ``index_url``, parses the response body as HTML and collects the
    ``href`` of every ``<a>`` tag whose target ends with ``.nc``.

    Args:
        index_url: URL of the page to scan for links.

    Returns:
        A list with the ``href`` values exactly as written in the page; they
        are not combined with ``index_url``.
    """
    # Parse target url
    reqs = requests.get(index_url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # href=True keeps only anchors that actually carry an href attribute;
    # without it, named anchors (<a name="...">) yield None and .endswith()
    # would raise AttributeError.
    return [
        link['href']
        for link in soup.find_all('a', href=True)
        if link['href'].endswith('.nc')
    ]

def list_climatenet_urls(index_url="https://portal.nersc.gov/project/ClimateNet/climatenet_new/test/"):
    """List full URLs of the NetCDF files linked from an HTML index page.

    Fetches ``index_url``, parses the response body as HTML and, for every
    ``<a>`` tag whose ``href`` ends with ``.nc``, builds
    ``<index_url>/<href>``.

    Args:
        index_url: URL of the index page. A trailing slash is optional.

    Returns:
        A list of URL strings. The ``href`` values are appended as-is, so
        this assumes the page links its files with relative names
        (TODO confirm against the index page).
    """
    # Parse target url
    reqs = requests.get(index_url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # Drop trailing slashes so the join below never yields ".../test//file.nc"
    base_url = index_url.rstrip('/')

    # href=True skips anchors without an href, for which .get('href') would
    # return None and .endswith() would raise AttributeError.
    return [
        f"{base_url}/{link['href']}"
        for link in soup.find_all('a', href=True)
        if link['href'].endswith('.nc')
    ]

### Download a file to Google Drive

def download_file_gdrive(index_url, file_url, dest_dir):
    """Download one file into the Google Drive data folder.

    The file is fetched from ``index_url + file_url`` (plain concatenation)
    and stored under ``/content/gdrive/My Drive/CS230/Data/<dest_dir>/``
    using the base name of ``file_url``. The folder is created when missing.

    Args:
        index_url: Base URL the file link is relative to.
        file_url: Link to the file, appended to ``index_url``.
        dest_dir: Sub-folder of the Drive data folder to write into.

    Returns:
        None. A one-line status message is printed either way.
    """
    # Create folder
    target_dir = '/content/gdrive/My Drive/CS230/Data/' + dest_dir
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, os.path.basename(file_url))

    # Stream GET request; the context manager releases the connection
    with requests.get(index_url + file_url, stream=True) as r:
        # Do not store an HTTP error page under a .nc file name
        if not r.ok:
            print(f"Failed to download: {file_url}")
            return

        # Write each chunk as it arrives instead of collecting the whole
        # file in memory first
        with open(target_path, "wb") as file:
            for block in r.iter_content(chunk_size=None):
                if block:
                    file.write(block)

            # The write position after the last chunk is the file size
            size = file.tell()

    # Display file size
    print("Download complete: " + file_url + " – Size: " + str(size) + " bytes.")

### Download all NetCDF files from a target url ###

def download_climate_net(index_url, dest_dir):
    """Download every NetCDF file linked from ``index_url``.

    The links come from ``list_nc_datasets`` and each one is handed to
    ``download_file_gdrive`` together with ``index_url`` and ``dest_dir``.
    An ``i/N`` progress counter is printed before each download.

    Args:
        index_url: URL of the index page listing the files.
        dest_dir: Destination folder name passed on to the downloader.
    """
    file_urls = list_nc_datasets(index_url)
    total = len(file_urls)

    for position, file_url in enumerate(file_urls, start=1):
        # end=" " leaves the cursor on the same line after the counter
        print(f"{position}/{total}", end=" ")
        download_file_gdrive(index_url, file_url, dest_dir)

def download_urls(url_list, output_dir="/root/data_downloads"):
    """Download each URL in ``url_list`` into ``output_dir``.

    Every file is saved under the last path segment of its URL. A URL whose
    response status is not 200 is reported and skipped; the remaining URLs
    are still processed.

    Args:
        url_list: Iterable of URL strings to fetch.
        output_dir: Folder to write into; created (with parents) if missing.

    Returns:
        None. One status line is printed per URL.
    """
    # Create the destination up front so open() cannot fail on a missing folder
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for url in url_list:
        # stream=True avoids holding a whole file in memory; the context
        # manager releases the connection once the body has been consumed
        with requests.get(url, stream=True) as response:
            if response.status_code != 200:
                print(f"Failed to download: {url}")
                continue

            # Extract the filename from the URL
            filename = url.split("/")[-1]
            output_path = os.path.join(output_dir, filename)

            # Save the response content to a file, one chunk at a time
            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        print(f"Downloaded: {url}")

def download_climatenet_local(
    index_url="https://portal.nersc.gov/project/ClimateNet/climatenet_new/test/",
    output_dir="/root/data_downloads",
):
    """Download all NetCDF files listed at ``index_url`` to a local folder.

    Collects the file URLs with ``list_climatenet_urls`` and passes them,
    together with ``output_dir``, to ``download_urls``.

    Args:
        index_url: URL of the index page listing the files.
        output_dir: Local folder the files are written to.
    """
    download_urls(list_climatenet_urls(index_url), output_dir)


### Downloading test/train datasets


In [None]:
# Register an IPython alias: `%mkdatadir <path>` runs `mkdir -p <path>`
# (%l stands for the rest of the command line)
%alias mkdatadir mkdir -p %l
# Local folder that receives the files listed at the test/ index
CLIMATENET_TEST_PATH="/root/data_downloads/climatenet/test"
# Create the folder before downloading into it
%mkdatadir $CLIMATENET_TEST_PATH
download_climatenet_local("https://portal.nersc.gov/project/ClimateNet/climatenet_new/test/",CLIMATENET_TEST_PATH)

In [None]:
# Local folder that receives the files listed at the train/ index
CLIMATENET_TRAIN_PATH="/root/data_downloads/climatenet/train"
# mkdatadir is the alias registered with %alias in the previous cell,
# so that cell has to run first
%mkdatadir $CLIMATENET_TRAIN_PATH
download_climatenet_local("https://portal.nersc.gov/project/ClimateNet/climatenet_new/train/", CLIMATENET_TRAIN_PATH)

# Upload to S3

In [None]:

import os
import boto3
import sagemaker
# Open a SageMaker session; in this cell it is only used to look up a bucket
sess = sagemaker.Session()
bucket = sess.default_bucket()                    # Bucket name returned by the session; read by upload()
# Key prefix under which upload() stores the dataset objects
prefix = 'data/climatenet'
# Show which bucket the uploads will go to
print(bucket)

In [None]:
# Local folders holding the downloaded splits. These must be the folders the
# download cells write to (/root/data_downloads/climatenet/...): os.walk()
# yields nothing for a directory that does not exist, so a wrong path makes
# the upload finish silently without sending any file.
train_data_path = "/root/data_downloads/climatenet/train"
test_data_path = "/root/data_downloads/climatenet/test"


def upload(directory, data_prefix):
    """Upload every file below ``directory`` to the S3 bucket ``bucket``.

    Files directly inside ``directory`` are stored under the key
    ``<prefix>/<data_prefix>/<file name>``. Files in sub-folders keep their
    path relative to ``directory`` inside the key, so equally named files
    in different folders do not overwrite each other.

    Args:
        directory: Local folder to walk recursively.
        data_prefix: Key component placed after the module-level ``prefix``.

    Returns:
        None. A progress line is printed per file; the counter restarts for
        each folder visited.
    """
    import posixpath

    # One S3 handle for the whole run instead of a new session per file
    s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

    for root, _dirs, files in os.walk(directory):
        # Folder components between `directory` and the folder being visited
        rel_root = os.path.relpath(root, directory)
        sub_dirs = [] if rel_root == os.curdir else rel_root.split(os.sep)

        for file_num, file in enumerate(files, start=1):
            # S3 keys use forward slashes whatever the local separator is
            key = posixpath.join(prefix, data_prefix, *sub_dirs, file)
            # Join with root, not directory: os.walk also reports files
            # that live in sub-folders
            s3_bucket.Object(key).upload_file(os.path.join(root, file))
            print(f"{file} file  {file_num} of {len(files)} uploaded")


In [None]:
# Send the files under test_data_path to S3, with "test" as the key component after prefix
upload(test_data_path, "test")

In [None]:
# Send the files under train_data_path to S3, with "train" as the key component after prefix
upload(train_data_path, "train")