# Request Data from Copernicus (GloFAS; ERA5) and Save Merged Data to Cloud Object Storage (COS)

This initial notebook assumes that no data exists yet when it is run, so it downloads historical data for model training.

In [None]:
!pip install cdsapi netCDF4 xarray ibm_watson_studio_pipelines

In [None]:
from netCDF4 import Dataset
import xarray as xr

import cdsapi

from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import numpy as np
import pandas as pd

from ibm_watson_studio_pipelines import WSPipelines
import ibm_boto3

import logging
import os, types
import warnings
import pickle

warnings.filterwarnings("ignore")

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution, make sure to set the environment variables that the Pipeline environment would otherwise have passed to the notebook.
Refer to `credentials.py`.

In [None]:
# Local runs: put your credentials in credentials.py; it exports the environment
# variables read by the cells below. When the module is absent (e.g. the variables
# are already provided by the execution environment) the notebook simply continues.
try:
    from credentials import set_env_variables_for_credentials
except ImportError:
    print("credentials.py not found - expecting the credentials to be set as environment variables already")
else:
    set_env_variables_for_credentials()

In [None]:
## Retrieve COS credentials from the global pipeline parameters
import json
# Each variable holds a JSON document; parse it into a dict.
# os.environ[...] (instead of os.getenv) fails with a KeyError naming the missing
# variable rather than an opaque TypeError raised by json.loads(None).
project_cos_credentials = json.loads(os.environ['PROJECT_COS_CREDENTIALS'])
mlops_cos_credentials = json.loads(os.environ['MLOPS_COS_CREDENTIALS'])

## PROJECT COS 
AUTH_ENDPOINT = project_cos_credentials['AUTH_ENDPOINT']
ENDPOINT_URL = project_cos_credentials['ENDPOINT_URL']
API_KEY_COS = project_cos_credentials['API_KEY']
BUCKET_PROJECT_COS = project_cos_credentials['BUCKET']

## MLOPS COS
ENDPOINT_URL_MLOPS = mlops_cos_credentials['ENDPOINT_URL']
API_KEY_MLOPS = mlops_cos_credentials['API_KEY']
CRN_MLOPS = mlops_cos_credentials['CRN']
BUCKET_MLOPS  = mlops_cos_credentials['BUCKET']

In [None]:
# IBM Cloud API key, read from the environment (None when the variable is unset).
# Used at the end of the notebook to create the Watson Studio Pipelines client.
CLOUD_API_KEY = os.getenv('CLOUD_API_KEY')

In [None]:
def save_df_to_cos(df,filename,key):
    """
    Pickle an object to a local file and upload that file to the MLOps COS bucket.

    Args:
        df: Object to serialize with ``pickle`` (a DataFrame in this notebook).
        filename: Local path the pickle is written to before the upload.
        key: Object key under which the file is stored in ``BUCKET_MLOPS``.

    Returns:
        None. This is best effort: any exception is caught and printed together
        with a failure message instead of being raised.
    """

    try:
        with open(filename, 'wb') as file:
            pickle.dump(df, file)
        # The MLOps bucket is accessed with the MLOps API key / CRN, while the IAM
        # auth endpoint is the one taken from the project COS credentials.
        mlops_res = ibm_boto3.resource(
            service_name='s3',
            ibm_api_key_id=API_KEY_MLOPS,
            ibm_service_instance_id=CRN_MLOPS,
            ibm_auth_endpoint=AUTH_ENDPOINT,
            config=Config(signature_version='oauth'),
            endpoint_url=ENDPOINT_URL_MLOPS)

        mlops_res.Bucket(BUCKET_MLOPS).upload_file(filename,key)
        print(f"Dataframe {filename} uploaded successfully")
    except Exception as e:
        print(e)
        # f-string: the message previously printed the literal text "{filename}".
        print(f"Dataframe upload for {filename} failed")

def save_binary_to_cos(filename,key):
    """
    Upload an existing local file to the MLOps COS bucket.

    Args:
        filename: Path of the local file to upload.
        key: Object key under which the file is stored in ``BUCKET_MLOPS``.

    Returns:
        None. This is best effort: any exception is caught and printed together
        with a failure message instead of being raised.
    """

    try:
        # The MLOps bucket is accessed with the MLOps API key / CRN, while the IAM
        # auth endpoint is the one taken from the project COS credentials.
        mlops_res = ibm_boto3.resource(
            service_name='s3',
            ibm_api_key_id=API_KEY_MLOPS,
            ibm_service_instance_id=CRN_MLOPS,
            ibm_auth_endpoint=AUTH_ENDPOINT,
            config=Config(signature_version='oauth'),
            endpoint_url=ENDPOINT_URL_MLOPS)

        mlops_res.Bucket(BUCKET_MLOPS).upload_file(filename,key)
        print(f"File {filename} uploaded successfully")
    except Exception as e:
        print(e)
        # f-string: the message previously printed the literal text "{filename}".
        print(f"File upload for {filename} failed")

def check_if_file_exists(filename):
    """
    Check whether an object with key ``filename`` exists in the MLOps COS bucket.

    Args:
        filename: Exact object key to look for in ``BUCKET_MLOPS``.

    Returns:
        True if an object with exactly that key is listed, otherwise False
        (including when the listing is empty).
    """
    mlops_client = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=API_KEY_MLOPS,
        ibm_service_instance_id=CRN_MLOPS,
        ibm_auth_endpoint=AUTH_ENDPOINT,
        config=Config(signature_version='oauth'),
        endpoint_url=ENDPOINT_URL_MLOPS)

    # Restrict the listing to keys starting with `filename`: an unfiltered listing
    # returns only the first page of keys, so the object could be missed in a
    # bucket holding many objects.
    response = mlops_client.list_objects(Bucket=BUCKET_MLOPS, Prefix=filename)
    # 'Contents' is absent from the response when nothing matches.
    return any(entry['Key'] == filename for entry in response.get('Contents', []))

In [None]:
# Copernicus credentials (user id and API key) are taken from the environment.
# @hidden_cell
import os
CDS_USER_ID, CDS_API_KEY = (os.getenv(variable) for variable in ("CDS_USER_ID", "CDS_API_KEY"))

In [None]:
# Write the cdsapi configuration file into the home directory:
# the API url on the first line, "<user id>:<api key>" on the second.
import os
cdsapirc_path = os.path.join(os.path.expanduser('~'), '.cdsapirc')
with open(cdsapirc_path, 'w') as rc_file:
    rc_file.write('url: https://cds.climate.copernicus.eu/api/v2\n')
    rc_file.write(f'key: {CDS_USER_ID}:{CDS_API_KEY}')

In [None]:
# Ensure the Copernicus config exists where it was written: the home directory
# (the previous check looked at ../.cdsapirc, a different location).
# Only the file's presence is shown; printing its content would leak the API key
# into the notebook output.
os.path.isfile(os.path.join(os.path.expanduser('~'), '.cdsapirc'))

In [None]:
# Create the CDS API client used by the download functions below.
# Called without arguments; presumably it picks up the ~/.cdsapirc written above - confirm against the cdsapi docs.
copernicus = cdsapi.Client()

In [None]:
# Bounding box passed as the `area` of both CDS requests, ordered North, West, South, East.
# NOTE(review): a western bound of 25 looks unusual for "Europe" - confirm whether -25 was intended.
europe = [72,25,34,40] # NWSE bounds for Europe
# Days of the month as zero-padded strings '01'..'31'.
# (`range(31)` produced '0'..'30': an invalid day 0 and no 31st.)
days = [f'{day:02d}' for day in range(1, 32)]
# months = ['january', 'february', 'march', 'april']
# years = ['2023']

months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
years = ['2023', '2022']

In [None]:
def download_glofas_historic(client, bounds, years, months, days, download_path):
    """Request GloFAS historical river discharge and store it as a zipped NetCDF4 file.

    Args:
        client: Object exposing ``retrieve(dataset, request, target)`` (a ``cdsapi.Client``).
        bounds: Value sent as the ``area`` of the request.
        years: Value sent as ``hyear``.
        months: Value sent as ``hmonth``.
        days: Value sent as ``hday``.
        download_path: Target path without extension; ``.netcdf4.zip`` is appended.
    """
    target_file = f'{download_path}.netcdf4.zip'
    request = dict(
        system_version='version_3_1',
        variable='river_discharge_in_the_last_24_hours',
        format='netcdf4.zip',
        hyear=years,
        hmonth=months,
        hday=days,
        hydrological_model='lisflood',
        product_type='intermediate',
        area=bounds,
    )
    client.retrieve('cems-glofas-historical', request, target_file)

In [None]:
# Download ERA5-Land data (one 00:00 time step per requested day):
# soil temperature level 1, volumetric soil water layer 1, total precipitation
def download_era5_historic(client, bounds, years, months, days, download_path):
    """Request ERA5-Land variables and store them as a zipped NetCDF file.

    CDS datasets do not share one request format: this dataset expects months and
    days as zero-padded numbers ('01'), so the arguments are converted by *value*.
    (They used to be derived from the list lengths, so ``months=['march']``
    requested '01'.)

    Args:
        client: Object exposing ``retrieve(dataset, request, target)`` (a ``cdsapi.Client``).
        bounds: Value sent as the ``area`` of the request.
        years: Value sent as ``year``.
        months: English month names (any case) or month numbers.
        days: Day-of-month numbers (int or str). Values outside 1..31, such as
            '0', are dropped.
        download_path: Target path without extension; ``.netcdf.zip`` is appended.

    Raises:
        ValueError: if a month is neither a number nor an English month name,
            or a day is not numeric.
    """
    # Spelled out instead of using calendar.month_name, which is locale dependent.
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )

    def _month_number(value):
        # Accept both 'march' and '3' / '03' / 3.
        text = str(value).strip().lower()
        number = int(text) if text.isdigit() else month_names.index(text) + 1
        return f'{number:02d}'

    day_numbers = [int(day) for day in days]

    client.retrieve(
        'reanalysis-era5-land',
        {
            'variable': [
                'soil_temperature_level_1', 'total_precipitation', 'volumetric_soil_water_layer_1',
            ],
            'year': years,
            'month': [_month_number(month) for month in months],
            'day': [f'{day:02d}' for day in day_numbers if 1 <= day <= 31],
            'time': [
                '00:00'
            ],
            'format': 'netcdf.zip',
            'area': bounds,
        },
        f'{download_path}.netcdf.zip')

In [None]:
# Request the GloFAS data; the archive is written to glofas_2023.netcdf4.zip.
# Note that `years` also contains 2022, despite the "2023" in the file name.
download_glofas_historic(copernicus,bounds=europe,years=years,months=months,days=days, download_path="glofas_2023")

In [None]:
# Request the ERA5 data; the archive is written to era5_2023.netcdf.zip.
# Note that `years` also contains 2022, despite the "2023" in the file name.
download_era5_historic(copernicus,bounds=europe,years=years,months=months,days=days, download_path="era5_2023")

In [None]:
# Optional: uncomment to also store the raw downloaded archives in COS.
#era5_zip = save_binary_to_cos('era5_2023.netcdf.zip', 'era5_2023.netcdf.zip')
#glofas_zip = save_binary_to_cos('glofas_2023.netcdf4.zip', 'glofas_2023.netcdf4.zip')

In [None]:
!mkdir era5 && mkdir glofas

In [None]:
!unzip era5_2023.netcdf.zip -d era5 && unzip glofas_2023.netcdf4.zip -d glofas

In [None]:
# ERA5 data extracted from era5_2023.netcdf.zip
e5 = xr.open_dataset('era5/data.nc')
# GloFAS data extracted from glofas_2023.netcdf4.zip
f = xr.open_dataset('glofas/data.nc')

## Handle ERA5 Data

**Data**: Total Precipitation; Volumetric Soil Water Layer 1; Soil Temperature Level 1

**Mission**: We requested the above-mentioned variables for roughly the same coordinates as the GloFAS data (a variation of 0.05). Let's have a quick look at the dataset and prepare it for a training split, version control, and more.


In [None]:
# Display the ERA5 dataset as loaded from era5/data.nc
e5

In [None]:
# Interpolate the ERA5 data onto the coordinates it shares with the GloFAS dataset `f`,
# so that both can later be merged on identical time/latitude/longitude values.
# NOTE(review): the original comment said this drops the 'expver' mask from the coordinates,
# but 'expver' is still selected on in the following cell - confirm.
e5_interp = e5.interp_like(f)

In [None]:
# Display the interpolated ERA5 dataset
e5_interp

In [None]:
# Collapse the supplementary 'expver' dimension that CDS adds when a request returns a
# mixture of ERA5 (expver=1) and ERA5T (expver=5) data: keep the expver=1 values and
# fill their gaps from expver=5.
# See https://confluence.ecmwf.int/display/CUSF/ERA5+CDS+requests+which+return+a+mixture+of+ERA5+and+ERA5T+data
if 'expver' in e5_interp.dims:
    e5_combine = e5_interp.sel(expver=1).combine_first(e5_interp.sel(expver=5))
else:
    # The download has no 'expver' dimension, so there is nothing to combine.
    e5_combine = e5_interp
e5_combine.load()
e5_combine

In [None]:
# Select the ERA5 feature variables and the GloFAS `dis24` variable as xarray objects.
# Both names are rebound to DataFrames in the next cell.
X = e5_combine[['tp', 'stl1', 'swvl1']]
y = f['dis24']


In [None]:
# Convert features and target to DataFrames and turn the index levels
# (the coordinates) into regular columns in the same step.
X = e5_combine[['stl1', 'tp', 'swvl1']].to_dataframe().reset_index()
y = f['dis24'].to_dataframe().reset_index()

In [None]:
# Display the target DataFrame
y

In [None]:
# Merge X and y on the common coordinates (time, latitude, longitude)
# No `how` is given, so pandas' default join type applies (inner join per the pandas docs).
data = pd.merge(X, y, on=['time', 'latitude', 'longitude'])

In [None]:
# Display the merged DataFrame
data

In [None]:
# Most recent timestamp in the merged data (reported to the next pipeline node further below)
data['time'].max()

In [None]:
# Pickle and save data

# Used both as the local pickle file name and as the object key in the MLOps bucket
FILENAME = "era5-glofas-merged.pkl"

# save_df_to_cos prints the outcome; it does not raise when the upload fails
save_df_to_cos(data, FILENAME, FILENAME)

In [None]:
# Verify the upload: True if an object with key FILENAME is listed in the MLOps bucket
files_copied_in_cos = check_if_file_exists(FILENAME)
files_copied_in_cos

### Hand-off to Next Pipeline Node

In [None]:
# Results handed to the next pipeline node
validation_params = {}
validation_params['most_recent_day_in_data'] = str(data['time'].max()).split()[0] # Date part of the most recent timestamp in the data (e.g. '2023-04-30')
# Reuse FILENAME so the hand-off always names the file that was actually uploaded
validation_params['serialized_data_filename'] = FILENAME
validation_params['files_copied_in_cos'] = files_copied_in_cos

In [None]:
# Create the Watson Studio Pipelines client from the IBM Cloud API key and
# store the hand-off parameters as the results of this notebook
pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(validation_params)

### Make train test split

In [None]:
# Perform train-test split on the merged frame, so every feature row is paired with the
# target value of the same (time, latitude, longitude). The separate `X` / `y` frames are
# only aligned by row position, and `y` also carries the coordinate columns.
# (train_test_split is already imported at the top of the notebook.)
X_train, X_test, y_train, y_test = train_test_split(
    data[['time', 'latitude', 'longitude', 'stl1', 'tp', 'swvl1']],
    data['dis24'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of rows in the merged data
len(data['stl1'])

In [None]:
# Number of missing values per column
data.isnull().sum()

In [None]:
# Drop the rows without a precipitation ('tp') value; `data` itself is left unchanged
data_wo_precip = data.dropna(subset=['tp'])

In [None]:
# Display the rows that have a 'tp' value
data_wo_precip

In [None]:
# Missing values per column that remain after dropping the rows without 'tp'
data_wo_precip.isnull().sum()

In [None]:
# Additionally drop the rows without a 'swvl1' value and count the remaining missing values
data_wo_2 = data_wo_precip.dropna(subset=['swvl1'])
data_wo_2.isnull().sum()

In [None]:
# Summary statistics of the merged data (rows with missing values included)
data.describe()