# Initialize DVC and Start Tracking Merged Data

- Clean data
    - Drop columns not required for training
    - Drop rows with null values where it makes sense
      (river discharge may be NaN where there is no river; it makes sense to keep these rows so the model can learn where rivers are)
- Think about whether or not to have separate notebooks for new data retrievals and prep
- Version Control the data
- Train test splitting
- Version control again??

In [None]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
!pip install ibm-cos-sdk ibm_watson_studio_pipelines

In [3]:
import json
import logging
import os, types
import warnings
from dataclasses import dataclass

import numpy as np
import pandas as pd
from botocore.client import Config
from sklearn.model_selection import train_test_split

import ibm_boto3
from ibm_watson_studio_pipelines import WSPipelines

# Silence every warning for the rest of the session so library noise does not
# clutter the cell outputs. NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings("ignore")

In [3]:
!pip install 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

Collecting dvc[s3]
  Downloading dvc-2.57.3-py3-none-any.whl (439 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.4/439.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting colorama>=0.3.9 (from dvc[s3])
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting configobj>=5.0.6 (from dvc[s3])
  Downloading configobj-5.0.8-py2.py3-none-any.whl (36 kB)
Collecting distro>=1.3 (from dvc[s3])
  Downloading distro-1.8.0-py3-none-any.whl (20 kB)
Collecting dpath<3,>=2.1.0 (from dvc[s3])
  Downloading dpath-2.1.6-py3-none-any.whl (17 kB)
Collecting dvc-data<0.52,>=0.51.0 (from dvc[s3])
  Downloading dvc_data-0.51.0-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dvc-http>=2.29.0 (from dvc[s3])
  Downloading dvc_http-2.30.2-py3-none-any.whl (12 kB)
Collecting dvc-render<1,>=0.3.1 (from dvc[s3])
  Downloading dvc_render-

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution, make sure to set the environment variables that the Pipeline environment would otherwise have passed to the notebook.
Refer to `credentials.py`.

In [1]:
# Local runs only: put your credentials in credentials.py so that it can export the
# environment variables a Watson Studio Pipeline execution would otherwise provide.
try:
    from credentials import set_env_variables_for_credentials
except ModuleNotFoundError:
    # No credentials.py available (e.g. inside a pipeline execution): keep going and
    # rely on the environment variables that are already set.
    print("credentials.py not found - using the environment variables already set.")
else:
    set_env_variables_for_credentials()

In [4]:
## Retrieve COS credentials from the global pipeline parameters


def _load_json_env(name):
    """Parse the JSON document stored in the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The decoded JSON value.

    Raises:
        KeyError: If the variable is not set. The message names the missing
            variable; ``json.loads(None)`` would fail with an opaque TypeError.
        json.JSONDecodeError: If the value is not valid JSON.
    """
    raw_value = os.getenv(name)
    if raw_value is None:
        raise KeyError(f"Environment variable {name} is not set")
    return json.loads(raw_value)


# Read the JSON strings from the environment and decode them.
project_cos_credentials = _load_json_env('PROJECT_COS_CREDENTIALS')
mlops_cos_credentials = _load_json_env('MLOPS_COS_CREDENTIALS')

## PROJECT COS 
AUTH_ENDPOINT = project_cos_credentials['AUTH_ENDPOINT']
ENDPOINT_URL = project_cos_credentials['ENDPOINT_URL']
API_KEY_COS = project_cos_credentials['API_KEY']
BUCKET_PROJECT_COS = project_cos_credentials['BUCKET']

## MLOPS COS
ENDPOINT_URL_MLOPS = mlops_cos_credentials['ENDPOINT_URL']
API_KEY_MLOPS = mlops_cos_credentials['API_KEY']
CRN_MLOPS = mlops_cos_credentials['CRN']
BUCKET_MLOPS  = mlops_cos_credentials['BUCKET']

In [5]:
# Read both parameters from the environment; each is None when its variable is unset.
CLOUD_API_KEY, DATA_FILENAME = (
    os.environ.get(env_name)
    for env_name in ("CLOUD_API_KEY", "serialized_data_filename")
)

In [None]:
# # @hidden_cell
# Disabled manual override for CLOUD_API_KEY and DATA_FILENAME.
# NOTE(review): presumably meant for runs where the environment variables are not
# set - keep it commented out and never commit real values here.
# CLOUD_API_KEY = ""
# DATA_FILENAME = ""

In [None]:
# Secret to git repository on public git
# SECURITY: a GitHub personal access token was stored here in clear text and has been
# removed. Revoke that token on GitHub and supply a replacement through an environment
# variable instead of writing it into the notebook.

In [6]:
# @hidden_cell
# NOTE: env set in credentials.py
# Clone the Git repository in which DVC is initialised below. GIT_REPOSITORY is not
# assigned anywhere in this notebook; per the note above it comes from the environment.
!git clone $GIT_REPOSITORY

Cloning into 'dvc-testing'...
git: 'credential-manager-core' is not a git command. See 'git --help'.
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (16/16), done.[Kjects:  12% (2/16)[K
remote: Total 21 (delta 2), reused 21 (delta 2), pack-reused 0[K
Receiving objects: 100% (21/21), done.
Resolving deltas: 100% (2/2), done.


In [13]:
# NOTE(review): every `!` command runs in its own subshell, so this `cd` does not
# change the notebook's working directory. That is why the following cells repeat
# `cd dvc-testing &&` in front of each command.
!cd dvc-testing

In [15]:
# Confirm that the clone directory exists and print its absolute path.
!cd dvc-testing && pwd

/Users/ennmouri/csm/mlops-sustainability/dvc-testing


In [16]:
# Initialise a DVC repository inside the cloned Git repository.
!cd dvc-testing && dvc init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


In [17]:
!cd dvc-testing && mkdir data

In [1]:
# Print the notebook's own working directory (the parent of dvc-testing).
!pwd

/Users/ennmouri/csm/mlops-sustainability


In [7]:
# Download data/era5-glofas-merged.pkl from the DVC project at $GIT_REPOSITORY.
# The -o path is relative to the notebook's working directory, not to dvc-testing/.
!dvc get $GIT_REPOSITORY data/era5-glofas-merged.pkl -o data/era5-glofas-merged.pkl

  0% Downloading era5-glofas-merged-05132023.pkl|    |0/1 [00:00<?,    ?files/s]
![A
  0%|          |data/era5-glofas-merged-05132023.pk0.00/? [00:00<?,        ?B/s][A
  0%|          |data/era5-glofas-merged-051320230.00/497M [00:00<?,        ?B/s][A
  0%|          |data/era5-glofas-merged-0513816k/497M [00:00<01:08,    7.54MB/s][A
  1%|          |data/era5-glofas-merged-0514.33M/497M [00:00<00:21,    24.1MB/s][A
  1%|          |data/era5-glofas-merged-0516.02M/497M [00:00<00:27,    18.7MB/s][A
  2%|▏         |data/era5-glofas-merged-05111.9M/497M [00:00<00:18,    28.0MB/s][A
  4%|▎         |data/era5-glofas-merged-05118.6M/497M [00:00<00:12,    40.5MB/s][A
  5%|▍         |data/era5-glofas-merged-05122.5M/497M [00:00<00:12,    40.7MB/s][A
  5%|▌         |data/era5-glofas-merged-05126.4M/497M [00:00<00:12,    40.3MB/s][A
  6%|▌         |data/era5-glofas-merged-05130.3M/497M [00:00<00:12,    40.4MB/s][A
  7%|▋         |data/era5-glofas-merged-05134.2M/497M [00:01<00:11,    40.

In [12]:
# Disabled example: fetch a sample file from the iterative/dataset-registry repository.
# !cd dvc-testing && dvc get https://github.com/iterative/dataset-registry get-started/data.xml -o data/data.xml

  0% Downloading data.xml|                           |0/1 [00:00<?,    ?files/s]
!
  0%|          |get-started/data.xml               0.00/? [00:00<?,        ?B/s]
  0%|          |get-started/data.xml           0.00/13.8M [00:00<?,        ?B/s]
  0%|          |get-started/data.xml      67.6k/13.8M [00:00<00:41,     347kB/s]
  1%|          |get-started/data.xml       170k/13.8M [00:00<00:22,     638kB/s]
  2%|▏         |get-started/data.xml       323k/13.8M [00:00<00:14,     984kB/s]
  5%|▍         |get-started/data.xml       646k/13.8M [00:00<00:07,    1.79MB/s]
  9%|▉         |get-started/data.xml      1.29M/13.8M [00:00<00:03,    3.50MB/s]
 19%|█▉        |get-started/data.xml      2.68M/13.8M [00:00<00:01,    7.05MB/s]
 38%|███▊      |get-started/data.xml      5.17M/13.8M [00:00<00:00,    13.0MB/s]
 64%|██████▎   |get-started/data.xml      8.76M/13.8M [00:00<00:00,    20.6MB/s]
100% Downloading data.xml|██████████████████████|1/1 [00:02<00:00,  2.54s/files]
                          

In [20]:
!mv era5-glofas-merged.pkl dvc-testing/data

In [18]:
# List the repository's data folder to check that the file is in place.
!cd dvc-testing && ls -lh data

total 0


In [21]:
# Start tracking the data file with DVC. DVC then asks for data/.gitignore and the
# generated .dvc file to be added to Git.
!cd dvc-testing && dvc add data/era5-glofas-merged.pkl

[?25l                                                                          ⠋ Checking graph
Adding...                                                                       
!
  0% Checking cache in '/Users/ennmouri/csm/mlops-sustainability/dvc-testing/.dv
                                                                                
!
  0%|          |Transferring                          0/? [00:00<?,     ?file/s]
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s]
                                                                                
!
  0%|          |Checking out data/era5-glofas-merged-00/? [00:00<?,    ?files/s]
  0%|          |Checking out data/era5-glofas-merged-00/1 [00:00<?,    ?files/s]
100% Adding...|████████████████████████████████████████|1/1 [00:01,  1.97s/file]

To track the changes with git, run:

	git add data/.gitignore data/era5-glofas-merged-05132023.pkl.dvc

To enable auto staging, run:

	dvc config core.autostage true


In [22]:
# To track the cahnges with git, run:
!git add data/.gitignore data/era5-glofas-merged.pkl.dvc

fatal: not a git repository (or any of the parent directories): .git


In [23]:
# To enable auto staging, run:
!dvc config core.autostage true

ERROR: configuration error - config file error: Not inside a DVC repo


In [24]:
# List the contents of the notebook's working directory.
!ls

a0_retrieve_and_prepare_copernicus_data.ipynb
a1_data_prep_and_version_control.ipynb
assets
dvc-testing
era5
era5_2023.netcdf.zip
glofas
glofas_2023.netcdf4.zip
venv


In [25]:
!cd dvc-testing && git config --global user.email "ilias.ennmouri@ibm.com"
!cd dvc-testing && git config --global user.name "Ilias Ennmouri"

In [26]:
# Commit whatever is currently staged in the repository.
!cd dvc-testing && git commit -m "Add test climate data"

[main (root-commit) 1b8974f] Add test climate data
 3 files changed, 6 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore


In [19]:
# Add remote

In [27]:
# Register the bucket as the DVC remote "ibm-cos": -d makes it the default remote,
# -f overwrites an existing remote of the same name.
!cd dvc-testing && dvc remote add -d -f ibm-cos s3://mlops-sustainability-data/

Setting 'ibm-cos' as a default remote.


In [28]:
# Set the S3 endpoint URL of the remote to the Cloud Object Storage endpoint.
!cd dvc-testing && dvc remote modify ibm-cos endpointurl https://s3.eu-de.cloud-object-storage.appdomain.cloud

In [29]:
!cd dvc-testing && dvc remote modify ibm-cos access_key_id $HMAC_ADMIN_ACCESS_KEY

In [30]:
!cd dvc-testing && dvc remote modify ibm-cos secret_access_key $HMAC_ADMIN_SECRET_ACCESS_KEY

In [31]:
# Commit the remote configuration stored in .dvc/config.
!cd dvc-testing && git commit .dvc/config -m "Try configure remote storage"

[main f7b96de] Try configure remote storage
 1 file changed, 7 insertions(+)


In [None]:
# NOTE(review): this is the same command as in the preceding cell. If .dvc/config has
# not changed since that commit, there is nothing left to commit here.
!cd dvc-testing && git commit .dvc/config -m "Try configure remote storage"

[main f7b96de] Try configure remote storage
 1 file changed, 7 insertions(+)


In [32]:
# Upload the DVC-tracked data to the default remote.
!cd dvc-testing && dvc push

  0% Transferring|                                   |0/1 [00:00<?,     ?file/s]
!
  0%|          |/Users/ennmouri/csm/mlops-sustainab0.00/? [00:00<?,        ?B/s]
  0%|          |/Users/ennmouri/csm/mlops-sustai0.00/497M [00:00<?,        ?B/s]
 10%|█         |/Users/ennmouri/csm/mlops-s50.0M/497M [00:10<01:30,    5.15MB/s]
 20%|██        |/Users/ennmouri/csm/mlops-su100M/497M [00:19<01:15,    5.53MB/s]
 30%|███       |/Users/ennmouri/csm/mlops-su150M/497M [00:28<01:04,    5.61MB/s]
 40%|████      |/Users/ennmouri/csm/mlops-su200M/497M [00:37<00:55,    5.63MB/s]
 50%|█████     |/Users/ennmouri/csm/mlops-su250M/497M [00:46<00:45,    5.66MB/s]
 60%|██████    |/Users/ennmouri/csm/mlops-su300M/497M [00:56<00:36,    5.59MB/s]
 70%|███████   |/Users/ennmouri/csm/mlops-su350M/497M [01:05<00:27,    5.63MB/s]
 81%|████████  |/Users/ennmouri/csm/mlops-su400M/497M [01:14<00:17,    5.65MB/s]
 91%|█████████ |/Users/ennmouri/csm/mlops-su450M/497M [01:23<00:08,    5.70MB/s]
100% Transferring|████████