# Initialize DVC and Start Tracking Merged Data

- Clean data
    - Drop columns not required for training
    - Drop rows with null values where it makes sense
    (river discharge may be NaN where there is no river; it makes sense to keep these rows so the model can learn where rivers are)
- Think about whether or not to have separate notebooks for new data retrievals and prep
- Version Control the data
- Train test splitting
- Version control again??

In [None]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
!pip install ibm-cos-sdk ibm_watson_studio_pipelines

In [None]:
from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import numpy as np
import pandas as pd

from ibm_watson_studio_pipelines import WSPipelines
import ibm_boto3

import logging
import os, types
import warnings

warnings.filterwarnings("ignore")

In [None]:
!pip install 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution, make sure to set the environment variables that the Pipeline environment would have passed to the notebook.
Refer to ```credentials.py```.

In [None]:
# For local runs: put your credentials in credentials.py and run this cell so the
# environment variables read by the following cells are set.
# NOTE(review): this cell is active as written — presumably it should be skipped or
# commented out when the notebook runs inside a Watson Studio Pipeline; confirm.
from credentials import set_env_variables_for_credentials
set_env_variables_for_credentials()

In [None]:
## Retrieve cos credentials from global pipeline parameters
import json


def _load_json_env(name):
    """Return the parsed JSON document stored in environment variable *name*.

    Raises:
        KeyError: if the variable is not set. Without this check
            ``json.loads(None)`` fails with a TypeError that does not say
            which variable is missing.
        json.JSONDecodeError: if the variable does not hold valid JSON.
    """
    raw = os.getenv(name)
    if raw is None:
        raise KeyError(
            f"Environment variable {name!r} is not set; it must contain the "
            "COS credentials as a JSON string."
        )
    return json.loads(raw)


# The environment variables hold JSON strings; parse them into Python objects.
project_cos_credentials = _load_json_env('PROJECT_COS_CREDENTIALS')
mlops_cos_credentials = _load_json_env('MLOPS_COS_CREDENTIALS')

## PROJECT COS 
AUTH_ENDPOINT = project_cos_credentials['AUTH_ENDPOINT']
ENDPOINT_URL = project_cos_credentials['ENDPOINT_URL']
API_KEY_COS = project_cos_credentials['API_KEY']
BUCKET_PROJECT_COS = project_cos_credentials['BUCKET']

## MLOPS COS
ENDPOINT_URL_MLOPS = mlops_cos_credentials['ENDPOINT_URL']
API_KEY_MLOPS = mlops_cos_credentials['API_KEY']
CRN_MLOPS = mlops_cos_credentials['CRN']
BUCKET_MLOPS  = mlops_cos_credentials['BUCKET']

In [None]:
# Settings read from the process environment; each is None when its variable is unset.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY")
DATA_FILENAME = os.environ.get("serialized_data_filename")

In [None]:
# # @hidden_cell
# CLOUD_API_KEY = ""
# DATA_FILENAME = ""

In [None]:
# Secret to git repository on public git
#***REMOVED***

In [None]:
# @hidden_cell
# NOTE: env set in credentials.py
!git clone $GIT_REPOSITORY

In [None]:
# NOTE: a `!` command runs in its own subshell, so this `cd` does not change the
# notebook's working directory; the later cells repeat `cd dvc-testing && ...` for
# that reason.
!cd dvc-testing

In [None]:
!cd dvc-testing && pwd

In [None]:
!cd dvc-testing && dvc init

In [None]:
!cd dvc-testing && mkdir data

In [None]:
!pwd

In [None]:
!dvc get $GIT_REPOSITORY data/era5-glofas-merged.pkl -o data/era5-glofas-merged.pkl

In [None]:
# The preceding `dvc get ... -o data/era5-glofas-merged.pkl` writes the file under
# ./data/, so move it from there into the cloned repository's data directory.
!mv data/era5-glofas-merged.pkl dvc-testing/data/

In [None]:
!cd dvc-testing && ls -lh data

In [None]:
!cd dvc-testing && dvc add data/era5-glofas-merged.pkl

In [None]:
# To track the changes with git, stage the files created by `dvc add`.
# They live inside the cloned repository, so the command has to run from there.
!cd dvc-testing && git add data/.gitignore data/era5-glofas-merged.pkl.dvc

In [None]:
# To enable auto staging, run (from inside the DVC repository that was initialised
# in dvc-testing):
!cd dvc-testing && dvc config core.autostage true

In [None]:
!ls

In [None]:
# Set the commit identity for this repository only (no --global), so the user-wide
# git configuration of the machine running the notebook is left untouched.
!cd dvc-testing && git config user.email "ilias.ennmouri@ibm.com"
!cd dvc-testing && git config user.name "Ilias Ennmouri"

In [None]:
!cd dvc-testing && git commit -m "Add test climate data"

In [None]:
# Add remote

In [None]:
# Register `ibm-cos` as the default (-d) DVC remote, overwriting an existing entry (-f).
# NOTE(review): the bucket name is hard-coded although BUCKET_MLOPS is read from
# MLOPS_COS_CREDENTIALS earlier in this notebook — confirm they refer to the same bucket.
!cd dvc-testing && dvc remote add -d -f ibm-cos s3://mlops-sustainability-data/

In [None]:
# Point the S3-compatible remote at the IBM COS endpoint.
# NOTE(review): the endpoint is hard-coded although ENDPOINT_URL_MLOPS is read from
# MLOPS_COS_CREDENTIALS earlier in this notebook — confirm they match.
!cd dvc-testing && dvc remote modify ibm-cos endpointurl https://s3.eu-de.cloud-object-storage.appdomain.cloud

In [None]:
# --local writes the key to .dvc/config.local (git-ignored) instead of .dvc/config,
# which is committed to git further down in this notebook.
!cd dvc-testing && dvc remote modify --local ibm-cos access_key_id $HMAC_ADMIN_ACCESS_KEY

In [None]:
# --local writes the secret to .dvc/config.local (git-ignored) instead of .dvc/config,
# which is committed to git further down in this notebook.
!cd dvc-testing && dvc remote modify --local ibm-cos secret_access_key $HMAC_ADMIN_SECRET_ACCESS_KEY

In [None]:
!cd dvc-testing && git commit .dvc/config -m "Try configure remote storage"

In [None]:
# NOTE(review): identical to the previous cell; if that commit succeeded there is
# nothing left to commit here — looks like an accidental duplicate, confirm.
!cd dvc-testing && git commit .dvc/config -m "Try configure remote storage"

In [None]:
!cd dvc-testing && dvc push