# Get Most Recent Date covered in current Dataset state

This notebook

- checks whether there already is data stored in our COS Bucket and tracked by DVC and
- if so, finds the most recent date covered by the data

The `most_recent_date` covered by the data is passed on to the pipeline. Based on that value, you will have to decide whether or not to pull newer data to supplement the current data.

In [None]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
!pip install ibm_watson_studio_pipelines 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

In [None]:
from ibm_watson_studio_pipelines import WSPipelines

import pandas as pd

import pickle
import dvc.api
import io

import logging
import os, types
import warnings

# Suppress every Python warning raised from this point on (global, process-wide filter).
# NOTE(review): presumably meant to keep the notebook output free of library noise;
# it also hides deprecation warnings — confirm this is intended.
warnings.filterwarnings("ignore")

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution, make sure to set the environment variables that the Pipeline environment would otherwise have passed to the notebook.
Refer to `credentials.py`.

In [None]:
# Local runs only: export the credentials as environment variables via the helper module.
# The module is optional: when it cannot be found the cell is skipped instead of
# aborting the notebook, and the environment variables are expected to be set already.
# NOTE(review): the surrounding text refers to credentials.py while the import targets
# credentials2 — confirm which module name is intended.
try:
    from credentials2 import set_env_variables_for_credentials
except ModuleNotFoundError as exc:
    # Only tolerate the helper module itself being absent; a missing dependency
    # inside that module is a real error and must still surface.
    if exc.name != "credentials2":
        raise
else:
    set_env_variables_for_credentials()

In [None]:
# IBM Cloud API key and Git repository settings, read from the process environment.
# Each value is None when the corresponding variable is not set.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY")
GIT_REPOSITORY = os.environ.get("GIT_REPOSITORY")
REPO_NAME = os.environ.get("REPO_NAME")

# File names of the serialized dataset and of the model.
DATA_FILENAME = os.environ.get("serialized_data_filename")
MODEL_FILENAME = os.environ.get("model_filename")

In [None]:
# Use the repository name supplied through the environment; "dvc-testing" is only a
# fallback for runs where REPO_NAME is unset or empty, so it no longer overrides a
# value that was explicitly provided.
REPO_NAME = os.getenv("REPO_NAME") or "dvc-testing"

In [None]:
# TODO: Make pipeline param
# Repository location handed to the DVC API calls; currently the Git repository
# taken from the environment.
repo = GIT_REPOSITORY

### Initialize DVC FileSystem

...and retrieve the paths of tracked objects.

In [None]:
from dvc.api import DVCFileSystem

# Filesystem view of the Git repository, pinned to its "main" revision.
fs = DVCFileSystem(GIT_REPOSITORY, rev="main")

# Walk the repository from its root and collect the DVC-tracked entries only;
# detail=False asks for plain paths instead of per-entry metadata.
dvc_tracked = fs.find("/", dvc_only=True, detail=False)


#### Check Dataset Existence

In [None]:
# Path under which the serialized dataset is expected in the DVC listing.
data_path = f"/data/{DATA_FILENAME}"

# The membership test already evaluates to a bool.
base_dataset_exists = data_path in dvc_tracked

#### Check Model Existence

In [None]:
# Path under which the model is expected in the DVC listing.
model_path = f"/model/{MODEL_FILENAME}"

# The membership test already evaluates to a bool.
model_exists = model_path in dvc_tracked

### DVC Pull and Deserialize Data

In [None]:
if base_dataset_exists:
    # Retrieve the dataset from the tracking information in git. The repository itself
    # contains the remote storage info and credentials.
    # Read from the same revision ("main") that the DVCFileSystem existence check
    # inspected, so the check and the read cannot refer to different repository states.
    serialized_data = dvc.api.read(f"data/{DATA_FILENAME}", repo=repo, rev="main", mode="rb")

    # SECURITY: unpickling can execute arbitrary code. This is only acceptable as long
    # as the repository and its remote storage are trusted.
    data = pickle.loads(serialized_data)

### Determine most recent date and pass through pipeline

In [None]:
if not base_dataset_exists:
    # No dataset tracked yet: report a placeholder instead of a date.
    most_recent_date = "n/a"
else:
    # Largest value of the 'time' column, reduced to its date part and rendered as text.
    # Assumes 'time' holds datetime-like values exposing .date() — TODO confirm.
    latest_timestamp = data['time'].max()
    most_recent_date = str(latest_timestamp.date())

In [None]:
# Results of this notebook, collected in one mapping: the two existence flags,
# the list of DVC-tracked paths, and the most recent date covered by the dataset.
data_exists_and_newest_date = {
    'base_dataset_exists': base_dataset_exists,
    'model_exists': model_exists,
    'list_dvc_tracked': dvc_tracked,
    'most_recent_date': most_recent_date,
}

In [None]:
# Create the Watson Studio Pipelines client from the IBM Cloud API key.
pipelines_client = WSPipelines.from_apikey(
    apikey=CLOUD_API_KEY,
)

# Hand the collected results over to the pipeline.
pipelines_client.store_results(
    data_exists_and_newest_date,
)