# Get Most Recent Date covered in current Dataset state

This notebook

- checks whether there already is data stored in our COS Bucket and tracked by DVC and
- if so, finds the most recent date covered by the data

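The `most_recent_date` covered is passed in as a pipeline parameter. Based on it, this notebook pulls newer Copernicus data (GloFAS, ERA5-Land) to supplement the current data, merges it with the DVC-tracked dataset and uploads the updated dataset to the MLOps COS bucket.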

In [None]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
!pip install ibm_watson_studio_pipelines cdsapi 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

In [None]:
from ibm_watson_studio_pipelines import WSPipelines

import pandas as pd
import xarray as xr

import ibm_boto3
from botocore.client import Config

import dvc.api
import cdsapi
import pickle
import io

import json
import logging
import os, types
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution, make sure to set the environment variables that the Pipeline environment would otherwise have passed to the notebook.
Refer to `credentials.py`.

In [None]:
# Local runs only: put your credentials in credentials2.py; its helper exports them as
# environment variables. When the module is not present (presumably the case inside a
# pipeline run, where the pipeline passes the variables itself) the cell is a no-op
# instead of aborting the notebook.
try:
    from credentials2 import set_env_variables_for_credentials
except ModuleNotFoundError:
    print("credentials2.py not found - relying on environment variables that are already set")
else:
    set_env_variables_for_credentials()

In [None]:
## Retrieve cos credentials from global pipeline parameters

# Each variable holds a JSON document that is parsed into a dict.
# `os.environ[...]` (rather than `os.getenv`) makes a missing variable fail with a
# KeyError that names it, instead of an opaque TypeError raised by `json.loads(None)`.
project_cos_credentials = json.loads(os.environ['PROJECT_COS_CREDENTIALS'])
mlops_cos_credentials = json.loads(os.environ['MLOPS_COS_CREDENTIALS'])

## PROJECT COS
# AUTH_ENDPOINT is also the auth endpoint used by the MLOps COS connections further down.
AUTH_ENDPOINT = project_cos_credentials['AUTH_ENDPOINT']
ENDPOINT_URL = project_cos_credentials['ENDPOINT_URL']
API_KEY_COS = project_cos_credentials['API_KEY']
BUCKET_PROJECT_COS = project_cos_credentials['BUCKET']

## MLOPS COS
ENDPOINT_URL_MLOPS = mlops_cos_credentials['ENDPOINT_URL']
API_KEY_MLOPS = mlops_cos_credentials['API_KEY']
CRN_MLOPS = mlops_cos_credentials['CRN']
BUCKET_MLOPS  = mlops_cos_credentials['BUCKET']

In [None]:
CLOUD_API_KEY = os.getenv("CLOUD_API_KEY")
GIT_REPOSITORY = os.getenv("GIT_REPOSITORY")
REPO_NAME = os.getenv("REPO_NAME")

DATA_FILENAME = os.getenv("serialized_data_filename")
MOST_RECENT_DATE = os.getenv("most_recent_date") # Most recent date found in tracked dataset

In [None]:
# TESTING
MOST_RECENT_DATE = '2023-04-30'

In [None]:
def save_df_to_cos(df, filename, key):
    """
    Pickle `df` to a local file and upload that file to the MLOps COS bucket.

    Parameters
    ----------
    df : object
        Object serialised with `pickle.dump` (the notebook passes a pandas DataFrame).
    filename : str
        Local path the pickle is written to; the same file is then uploaded.
    key : str
        Object key under which the file is stored in `BUCKET_MLOPS`.

    Returns
    -------
    None
        Errors are not raised. Any exception is printed together with a failure
        message, so callers have to verify the upload separately.
    """
    try:
        with open(filename, 'wb') as file:
            pickle.dump(df, file)

        mlops_res = ibm_boto3.resource(
            service_name='s3',
            ibm_api_key_id=API_KEY_MLOPS,
            ibm_service_instance_id=CRN_MLOPS,
            ibm_auth_endpoint=AUTH_ENDPOINT,
            config=Config(signature_version='oauth'),
            endpoint_url=ENDPOINT_URL_MLOPS)

        mlops_res.Bucket(BUCKET_MLOPS).upload_file(filename, key)
        print(f"Dataframe {filename} uploaded successfully")
    except Exception as e:
        # Best-effort by design: report the problem and carry on.
        print(e)
        # The f-prefix was missing, so the literal text "{filename}" used to be printed.
        print(f"Dataframe upload for {filename} failed")


def check_if_file_exists(filename):
    """
    Report whether an object with key `filename` exists in the MLOps COS bucket.

    Parameters
    ----------
    filename : str
        Exact object key to look for in `BUCKET_MLOPS`.

    Returns
    -------
    bool
        True when a listed object has exactly this key, otherwise False
        (including when the listing contains no objects at all).
    """
    mlops_client = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=API_KEY_MLOPS,
        ibm_service_instance_id=CRN_MLOPS,
        ibm_auth_endpoint=AUTH_ENDPOINT,
        config=Config(signature_version='oauth'),
        endpoint_url=ENDPOINT_URL_MLOPS)

    # Restrict the listing to keys starting with `filename`: a single S3-style listing
    # call returns a limited number of keys, so an unfiltered listing of a large bucket
    # could miss the object.
    response = mlops_client.list_objects(Bucket=BUCKET_MLOPS, Prefix=filename)

    # 'Contents' is absent when nothing matches; indexing it directly raised KeyError.
    return any(entry['Key'] == filename for entry in response.get('Contents', []))

### Retrieve today's date and determine difference in days compared to most recent day covered by tracked dataset

In [None]:
from datetime import datetime

# strptime pattern used to parse MOST_RECENT_DATE
date_format = '%Y-%m-%d'

# Current date as reported by datetime.now() (no timezone is passed)
today = datetime.now().date()
# NOTE(review): today_str is not referenced again in the visible notebook
today_str = str(today)

In [None]:
today

In [None]:
# Parse the configured date string into a `date` so it can be compared with `today`
most_recent_date = datetime.strptime(MOST_RECENT_DATE, date_format).date()
most_recent_date

In [None]:
# Number of whole days between the most recent covered date and today
# NOTE(review): day_diff is only displayed; no visible cell acts on it
day_diff = (today - most_recent_date).days
day_diff

In [None]:
# Use your Copernicus API_KEY
# @hidden_cell
CDS_USER_ID = os.getenv("CDS_USER_ID")
CDS_API_KEY = os.getenv("CDS_API_KEY")

In [None]:
# Setup copernicus credentials file for cdsapi
with open(os.path.join(os.path.expanduser('~'), '.cdsapirc'), 'w') as f:
    f.write('url: https://cds.climate.copernicus.eu/api/v2\n')
    f.write(f'key: {CDS_USER_ID}:{CDS_API_KEY}')

In [None]:
# Ensure COPERNICUS config is setup at the right place
!cat ~/.cdsapirc

In [None]:
copernicus = cdsapi.Client()

### (IF day_diff > 7) Get Copernicus Data between most_recent_date and today

In [None]:
europe = [72,25,34,40] # NWSE bounds for Europe

days = [str(i+1) for i in range(31)]
# months = ['january', 'february', 'march', 'april']
# years = ['2023']

all_months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
months = all_months[most_recent_date.month:today.month] # Slice array to get missing months

hours = [ '00:00',]

years = [str(today.year)]

In [None]:
all_months.index("february")

In [None]:
months

In [None]:
[str(all_months.index(month)+1) for month in months]

In [None]:
def get_newest_copernicus_data(client, bounds, years, months, days, download_path):
    """
    Download GloFAS and ERA5-Land archives for the requested period.

    For each of the two datasets the target archive is only requested when no file
    of that name exists yet. The archive names are exported as the environment
    variables `glofas_new_filename` and `era5_new_filename`.

    Besides its parameters the function reads the module-level names `hours`
    (ERA5 'time' values) and `all_months` (month names in calendar order).

    Parameters
    ----------
    client : object
        Object offering `retrieve(dataset_name, request, target)`; the notebook
        passes a `cdsapi.Client`.
    bounds : list
        Value sent as 'area' in both requests.
    years : list of str
        Years to request.
    months : list of str
        Month names; each must be an element of `all_months`.
    days : list of str
        Day-of-month values that can be parsed by `int`.
    download_path : str
        Middle part of the archive names: `glofas-<download_path>.netcdf4.zip`
        and `era5-<download_path>.netcdf.zip`.

    Returns
    -------
    None
    """
    ############ GloFAS #############
    glofas_format = ".netcdf4.zip"
    glofas_new_filename = f'glofas-{download_path}{glofas_format}'

    if os.path.exists(glofas_new_filename):
        # Existing archive is reused; only the download is skipped.
        print(f"Target filename already exists in target path ({download_path}{glofas_format})... cancelling download")
    else:
        client.retrieve(
            'cems-glofas-historical',
            {
                'system_version': 'version_3_1',
                'variable': 'river_discharge_in_the_last_24_hours',
                'format': 'netcdf4.zip',
                'hyear': years,
                'hmonth': months,
                'hday': days,
                'hydrological_model': 'lisflood',
                'product_type': 'intermediate',
                'area': bounds,
            },
            glofas_new_filename)
        print("Stored GloFAS data")

    # Exported in both branches: previously the variable stayed unset when the archive
    # already existed (the bare `exit` was a no-op), leaving later shell cells without
    # a file name.
    os.environ['glofas_new_filename'] = glofas_new_filename

    ############ ERA5 #############
    era5_format = ".netcdf.zip"
    era5_new_filename = f'era5-{download_path}{era5_format}'

    if os.path.exists(era5_new_filename):
        print(f"Target filename already exists in target path ({download_path}{era5_format})... cancelling download")
    else:
        client.retrieve(
            'reanalysis-era5-land',
            {
                'variable': [
                    'soil_temperature_level_1', 'total_precipitation', 'volumetric_soil_water_layer_1',
                ],
                'year': years,
                # CDS datasets do not share one request format: here months are sent as
                # numbers, so 'january', 'february' become '1', '2'.
                'month': [str(all_months.index(month) + 1) for month in months],
                # Zero-padded day values taken from `days` itself (the former version
                # only used len(days) and therefore ignored the actual values).
                'day': [f'{int(day):02d}' for day in days],
                'time': hours,
                'format': 'netcdf.zip',
                'area': bounds,
            },
            era5_new_filename)
        print("Stored ERA5 data")

    os.environ['era5_new_filename'] = era5_new_filename

In [None]:
get_newest_copernicus_data(
    copernicus,
    bounds=europe,
    years=years,
    months=months,
    days=days,
    download_path='-'.join(months) # Results in string of hyphen-separated months
)

In [None]:
!ls -l | grep may

### Unpack ERA5/GloFAS data and prep it for merger with dataset

In [None]:
!mkdir era5_new && mkdir glofas_new

In [None]:
!unzip $glofas_new_filename -d glofas_new

In [None]:
!unzip $era5_new_filename -d era5_new

In [None]:
glofas_new = xr.open_dataset("glofas_new/data.nc")
e5_new = xr.open_dataset("era5_new/data.nc")

In [None]:
# Inspect the freshly loaded ERA5 dataset
e5_new

In [None]:
# Inspect the freshly loaded GloFAS dataset
glofas_new

In [None]:
# Interpolate ERA5 onto the coordinates of the GloFAS dataset
# (use lat,long from glofas data - almost identical).
# Also ensures that the same time span is used.
e5_interp = e5_new.interp_like(glofas_new)

In [None]:
# In case the era5 data comes with an additional `expver` dimension, collapse it:
# take the expver=1 values and fill their gaps with the expver=5 values.
if "expver" in e5_interp.coords.dims:
    e5_interp = e5_interp.sel(expver=1).combine_first(e5_interp.sel(expver=5))
    # Load the combined data into memory
    e5_interp.load()
    # NOTE(review): this bare expression sits inside the `if` body, so it is not
    # rendered as cell output
    e5_interp

In [None]:
## Joining predictand onto feature y-interpolated table 
# Set features to keep and choose target variable
X = e5_interp.to_dataframe()
y = glofas_new['dis24'].to_dataframe()

# Reset the index to include the coordinates as columns
X.reset_index(inplace=True)
y.reset_index(inplace=True)

In [None]:
X

In [None]:
y

In [None]:
# Merge features and predictand together common coordinates (time, latitude, longitude)
data_new = pd.merge(X, y, on=['time', 'latitude', 'longitude'])
data_new

In [None]:
# Repository handed to `dvc.api.read` below
repo = \
    GIT_REPOSITORY

In [None]:
# Read the tracked pickle as bytes through DVC and deserialise it.
# NOTE(review): `pickle.load` can execute arbitrary code contained in the payload -
# only acceptable while the repository and its remote are trusted.
# NOTE(review): the path is hard-coded here although DATA_FILENAME is configured
# earlier in the notebook - confirm both refer to the same dataset.
data = pickle.load(
    io.BytesIO(
        dvc.api.read(
            f"data/era5-glofas-merged.pkl",
            repo=repo, 
            mode="rb"
        )
    )   
)

In [None]:
# Concatenate dataframe2 to dataframe1
concatenated = pd.concat([data, data_new])

# Remove duplicate rows
deduplicated = concatenated.drop_duplicates()

# Reset the index if needed
deduplicated.reset_index(drop=True, inplace=True)

# Print the deduplicated dataframe
deduplicated

In [None]:
deduplicated.dropna(axis=0),

In [None]:
# Name of the updated dataset; passed below both as local pickle path and as object key.
# NOTE(review): when `serialized_data_filename` is unset, DATA_FILENAME is None and
# this becomes "updated-None".
filename = f"updated-{DATA_FILENAME}"

# Failures are only printed by save_df_to_cos, not raised
save_df_to_cos(deduplicated, filename, filename)

### Clean-up

In [None]:
# Remove the directories the archives were extracted into
!rm -rf era5_new && rm -rf glofas_new

In [None]:
# Remove the downloaded archives named by the two environment variables
# NOTE(review): the local pickle written by save_df_to_cos is not removed here
!rm -rf $glofas_new_filename && rm -rf $era5_new_filename

### Report results back to the Watson Studio Pipeline

In [None]:
data_exists_and_newest_date = {}
data_exists_and_newest_date['copied_updated_data'] = check_if_file_exists(filename)
data_exists_and_newest_date['updated_data_filename'] = filename

In [None]:
pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(data_exists_and_newest_date)