# Download Data

Download all data provided by the organizers.
The data is split across several repositories, named according to the nomenclature defined at https://github.com/ecmwf-lab/climetlab-s2s-ai-challenge.



In [1]:
import bs4
import multiprocessing
import os
import pathlib
import random
import requests
import urllib

from tqdm.notebook import tqdm

# Index pages (HTML listings scraped by read_index) for the training data.
INDEX_TRAIN_INPUT = 'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-input/0.3.0/netcdf/index.html'
INDEX_TRAIN_OUTPUT = 'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-output-benchmark/index.html'
# Disabled; a request to this index returned an AccessDenied error in a recorded cell output of this notebook.
#INDEX_TRAIN_REFERENCE = 'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-output-reference/index.html'

# Index pages for the test data.
INDEX_TEST_INPUT = 'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-input/0.3.0/netcdf/index.html'
INDEX_TEST_OUTPUT = 'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-output-benchmark/index.html'
# NOTE(review): presumably disabled for the same reason as the training reference index - confirm.
#INDEX_TEST_REFERENCE = 'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-output-reference/index.html'
# Root directory for downloaded data; earlier machine-specific values are kept commented out.
#HOME = "C:\\Users\\klow55\\github\\crims2s"
#HOME = "C:\\Users\\kahch\\src\\crims2s\\crims2s"
HOME="D:\\weatherdata"
# Every dataset file is stored below this directory (one subdirectory per dataset).
TARGET_DIR = os.path.join(HOME, 's2s')
# Left disabled: download_one_dataset_file creates the needed directories with exist_ok=True.
#os.makedirs(TARGET_DIR)

In [2]:
def read_index(index_url):
    """Return the dataset file URLs listed on an index page.

    The page is expected to contain a ``<tbody>`` element whose
    ``<a href="...">`` anchors point at the dataset files.

    Args:
        index_url: URL of the ``index.html`` page to scrape.

    Returns:
        A list of file URLs, with relative hrefs resolved against
        ``index_url``. An empty list is returned (after printing a
        diagnostic) when the server answers with an HTTP error status or
        the page contains no ``<tbody>``.

    Raises:
        requests.RequestException: if the request itself fails
            (connection error, timeout, ...).
    """
    from urllib.parse import urljoin

    # A timeout keeps an unresponsive server from hanging the notebook.
    response = requests.get(index_url, timeout=60)

    # Do not try to parse an error document (e.g. an AccessDenied XML body)
    # as if it were the index listing.
    if not response.ok:
        print(f"HTTP {response.status_code}: {index_url}")
        return []

    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    table = soup.find('tbody')
    if table is None:
        print(f"no <tbody> found: {index_url}")
        return []

    # href=True skips anchors without a target instead of failing on them.
    links = table.find_all('a', href=True)
    return [urljoin(index_url, a_tag['href']) for a_tag in links]

In [3]:
# Inspect the raw response of the training reference index.
# The constant is commented out in the configuration cell, so it is defined
# here to keep this cell runnable on a fresh kernel.
INDEX_TRAIN_REFERENCE = 'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-output-reference/index.html'
index_url = INDEX_TRAIN_REFERENCE
html = requests.get(index_url).text
html

'<?xml version="1.0" encoding="UTF-8"?><Error><Code>AccessDenied</Code><BucketName>s2s-ai-challenge</BucketName><RequestId>tx0000000000000000726bd-0062f466b3-2a4cdd6c-default</RequestId><HostId>2a4cdd6c-default-default</HostId></Error>'

In [3]:
def download_one_dataset_file(file_url):
    """Download one dataset file below TARGET_DIR and return its local path.

    Inspired by https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests.

    The local path is ``TARGET_DIR/<dataset>/<file name>``, where
    ``<dataset>`` is the component at index 3 of the URL path split on '/'
    (e.g. ``/s2s-ai-challenge/data/training-input/...`` gives
    ``training-input``) and ``<file name>`` is the last path component.

    If the file already exists locally with the size announced by the
    server, nothing is downloaded.

    Args:
        file_url: URL of the file to download.

    Returns:
        The path of the local file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: if the request itself fails.
    """
    url_path = urllib.parse.urlparse(file_url).path
    paths = url_path.split('/')
    download_dir = os.path.join(TARGET_DIR, paths[3])
    download_path = os.path.join(download_dir, paths[-1])
    # Data is written to a side file first so that an interrupted download
    # never leaves a truncated file under the final name.
    partial_path = download_path + '.part'
    os.makedirs(download_dir, exist_ok=True)

    with requests.get(file_url, stream=True, timeout=60) as stream:
        # Without this check an error document would be saved as the dataset.
        stream.raise_for_status()

        # Skip the download if the file already exists with the expected size.
        # The header may be absent; in that case download again.
        content_length = stream.headers.get('Content-Length')
        if content_length is not None and os.path.isfile(download_path):
            if int(content_length) == os.path.getsize(download_path):
                return download_path

        with open(partial_path, 'wb') as f:
            for chunk in stream.iter_content(chunk_size=8192):
                f.write(chunk)

    # Move the complete file into place (replaces any stale previous copy).
    os.replace(partial_path, download_path)
    return download_path

In [3]:
# Sanity check: list the training-input index and show how many files it references.
files_to_read = read_index(INDEX_TRAIN_INPUT)
print(len(files_to_read))

3034


In [5]:
# Collect the file URLs of every enabled index page into one flat list,
# in the order: training output, training input, test input, test output.
index_pages = [INDEX_TRAIN_OUTPUT, INDEX_TRAIN_INPUT, INDEX_TEST_INPUT, INDEX_TEST_OUTPUT]
dataset_files = []
for index in index_pages:
    dataset_files += read_index(index)

In [7]:
# Total number of file URLs collected from all index pages.
len(dataset_files)

3046

In [15]:
# Show the first collected URL to check its format.
dataset_files[0]

'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-output-benchmark/t2m-weeks-34.nc'

In [20]:
# Drop the first 23 characters of the URL path (the length of
# '/s2s-ai-challenge/data/') to see the dataset-relative part.
urllib.parse.urlparse(dataset_files[0]).path[23:]

'training-output-benchmark/t2m-weeks-34.nc'

In [34]:
# Download a single file as a smoke test and keep its local path:
# the "Test local file" cells open `local_file` with xarray.
local_file = download_one_dataset_file(dataset_files[0])
local_file

'E:\\weatherdata\\s2s\\training-output-benchmark\\t2m-weeks-34.nc'

In [6]:
# Save the collected URLs to a text file, one URL per line.
with open("D:\\weatherdata\\s2sdatafiles.txt", "wt") as fout:
    fout.writelines(url + '\n' for url in dataset_files)

## Test local file

In [None]:
import xarray as xr

In [None]:
# Open a downloaded file with xarray. `local_file` must hold the path of a
# file already on disk (e.g. the value returned by download_one_dataset_file).
d = xr.open_dataset(local_file)

In [None]:
# Display the dataset summary.
d

In [None]:
# Display the dataset's `lead_time` (presumably a coordinate - confirm against the file).
d.lead_time

In [None]:
# Plot the `t2m` variable for the first lead_time, the third forecast_time
# and the first category (positional indices 0, 2 and 0).
d.t2m.isel(lead_time=0, forecast_time=2, category=0).plot()

## Batch Download

In [None]:
# Worker threads instead of worker processes: a process pool cannot run a
# function defined in a notebook cell when child processes are spawned
# (the default on Windows), and downloading is I/O-bound so threads overlap
# the waiting just as well.
from multiprocessing.pool import ThreadPool

# Download in random order.
# NOTE(review): presumably to mix files from the different datasets - confirm.
random.shuffle(dataset_files)

with ThreadPool(processes=16) as pool:
    # Results are discarded, so completion order is enough and lets the
    # progress bar advance as soon as any download finishes.
    for _ in tqdm(pool.imap_unordered(download_one_dataset_file, dataset_files), total=len(dataset_files)):
        pass

  0%|          | 0/6123 [00:00<?, ?it/s]

In [None]:
# Sequential download driven by the saved URL list.
# Only lines whose index is a multiple of 4 are processed (0, 4, 8, ...).
with open("D:\\weatherdata\\s2sdatafiles.txt") as url_listing:
    for i, url_line in enumerate(url_listing):
        if i % 4 == 0:
            downloaded = download_one_dataset_file(url_line.strip())
            print(f"{i}:{downloaded}")

0:D:\weatherdata\s2s\training-output-benchmark\t2m-weeks-34.nc
4:D:\weatherdata\s2s\training-output-benchmark\tp-weeks-56.nc
8:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200116.nc
12:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200213.nc
16:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200312.nc
20:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200409.nc
24:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200507.nc
28:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200604.nc
32:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200702.nc
36:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200730.nc
40:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200827.nc
44:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20200924.nc
48:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20201022.nc
52:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20201119.nc
56:D:\weatherdata\s2s\training-input\eccc-hindcast-ci-20201217.nc
60:D:\weatherdata\

488:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200206.nc
492:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200305.nc
496:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200402.nc
500:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200430.nc
504:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200528.nc
508:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200625.nc
512:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200723.nc
516:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200820.nc
520:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20200917.nc
524:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20201015.nc
528:D:\weatherdata\s2s\training-input\eccc-hindcast-t-20201112.nc
