# Pull Newest Full Data, make Train Test split and Track those.

- Clean data
    - Drop columns not required for training
    - Drop rows with null values where it makes sense
    (river discharge may be NaN where there is no river. It makes sense to keep these rows for the model to learn where rivers are)
- Think about whether or not to have separate notebooks for new data retrievals and prep
- Version Control the data
- Train test splitting
- Version control again??

In [2]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
!pip install ibm-cos-sdk ibm_watson_studio_pipelines 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

zsh:1: /Users/ennmouri/csm/mlops-sustainability-oss/venv/bin/pip: bad interpreter: /Users/ennmouri/csm/mlops-sustainability/venv/bin/python3: no such file or directory

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [3]:
from ibm_watson_studio_pipelines import WSPipelines
import ibm_boto3

from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import numpy as np
import pandas as pd

import pickle
import dvc.api
import io

import logging
import os, types
import warnings

# Suppress all warnings for the remainder of the session.
# NOTE(review): a blanket "ignore" also hides deprecation notices from the libraries
# imported above — consider narrowing the filter.
warnings.filterwarnings("ignore")

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution, make sure to set the environment variables that the Pipeline environment would otherwise have passed to the notebook.
Refer to ```credentials.py```.

In [23]:
# Uncomment this cell and put your credentials in credentials.py to run locally.
# NOTE(review): the import below names `credentials2`, while the instructions refer to
# `credentials.py` — confirm which module actually provides the helper.
# from credentials2 import set_env_variables_for_credentials
# set_env_variables_for_credentials()

In [24]:
# Read the pipeline-supplied settings from the environment in one pass.
# Any variable that is not set resolves to None.
CLOUD_API_KEY, DATA_FILENAME, GIT_REPOSITORY, REPO_NAME = (
    os.environ.get(name)
    for name in (
        "CLOUD_API_KEY",
        "serialized_data_filename",
        "GIT_REPOSITORY",
        "REPO_NAME",
    )
)

In [26]:
# Hard-coded override: this replaces whatever the REPO_NAME environment variable supplied.
# The shell cells further down `cd` into a directory of this same name.
# TODO: remove the override once the value reliably arrives as a pipeline parameter.
REPO_NAME = "dvc-testing"

### DVC Pull and Deserialize Data

In [6]:
# TODO: Make pipeline param
# Repository reference passed to the DVC API when reading tracked data.
repo = GIT_REPOSITORY

In [59]:
# Retrieve dataset from tracking information in git. The repository itself contains the remote storage info and credentials.
# The file is read as raw bytes and unpickled directly from that buffer.
# NOTE: unpickling can execute code embedded in the payload — only read from a repository/remote you trust.
data = pickle.loads(
    dvc.api.read(f"data/{DATA_FILENAME}", repo=repo, mode="rb")
)

In [60]:
# Drop rows where at least one col-value is NaN.
# The filtered frame is computed once and reused for both the report and the
# assignment, instead of running dropna over the full table twice.
# NOTE(review): the notebook's intro suggests keeping rows whose river discharge is
# NaN; this removes every row containing any NaN — confirm that is intended.
rows_before = len(data)
data = data.dropna(axis=0)
print(f"Dropped {rows_before - len(data)} rows.")

Dropped 1881300 rows.


In [62]:
# E.g. col 'step' has only a single unique value. Its existence has no effect on training and is solely a waste of resources.
# Therefore we will drop all cols with that characteristic.
# The constant columns are collected first and removed with a single drop call, so the
# frame is copied once rather than once per dropped column.
constant_cols = [key for key in data.keys() if len(data[key].unique()) < 2]
for key in constant_cols:
    print(f"col '{key}' dropped because it bears no more than one unique value.")
data = data.drop(constant_cols, axis=1)

col 'step' dropped because it bears no more than one unique value.
col 'surface' dropped because it bears no more than one unique value.


In [63]:
# Parse the 'time' column into pandas datetime values.
# NOTE(review): the result is a datetime column, not a numeric one — confirm that the
# downstream training step can consume it as a feature.
data['time'] = pd.to_datetime(data['time'])  # Convert dates to datetime objects

# Disabled: encoding the coordinates as categorical codes.
#data['latitude'] = data['latitude'].astype('category').cat.codes  # Encode coordinates as categorical codes
#data['longitude'] = data['longitude'].astype('category').cat.codes  # Encode coordinates as categorical codes

In [64]:
# `train_test_split` is already imported in the notebook's import cell.
# `data` is the cleaned DataFrame; 'dis24' is the target column.
X = data.drop('dis24', axis=1)  # input features: every column except the target
y = data['dis24']  # target column

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
TEST_SIZE = 0.2
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [65]:
def serialize(obj, target_path):
    """Pickle ``obj`` into the file at ``target_path``.

    Args:
        obj: Any picklable Python object.
        target_path: Destination file path. The parent directory must already exist.

    Raises:
        Exception: Whatever ``open`` or ``pickle.dump`` raised. The error is printed
            and then re-raised, so that a failed write stops the run instead of
            letting later steps pick up a missing or stale file.
    """
    try:
        with open(target_path, 'wb') as _file:
            pickle.dump(obj, _file)
    except Exception as e:
        print(e)
        # Propagate: silently continuing would hide the failed write from the caller.
        raise

In [66]:
# Location of the training package inside the repository. The value starts with a slash
# because it is appended directly to REPO_NAME for the local write, and it is later
# looked up in the path listing obtained from DVCFileSystem.
train_target = "/data/train_package.pkl"

In [67]:
# Bundle the training features and labels into one dict and pickle it to
# <REPO_NAME>/data/train_package.pkl.
train_package = {"X_train": X_train, "y_train": y_train}

serialize(train_package, REPO_NAME + train_target)

In [68]:
# Location of the test package inside the repository. The value starts with a slash
# because it is appended directly to REPO_NAME for the local write, and it is later
# looked up in the path listing obtained from DVCFileSystem.
test_target = "/data/test_package.pkl"

In [69]:
# Bundle the held-out features and labels into one dict and pickle it to
# <REPO_NAME>/data/test_package.pkl.
test_package = {"X_test": X_test, "y_test": y_test}

serialize(test_package, REPO_NAME + test_target)

###  Setup DVC Situation

Since we assume CPDaaS as environment, we will need to clone the dvc setup repository again.
Run the line shown below.

```
!git clone https://[GIT_TOKEN]@github.com/[GIT_REPOSITORY].git
```


In [70]:
# @hidden_cell
!git clone $GIT_REPOSITORY

fatal: destination path 'dvc-testing' already exists and is not an empty directory.


In [71]:
!cd dvc-testing && dvc add data/train_package.pkl data/test_package.pkl

zsh:1: /Users/ennmouri/csm/mlops-sustainability-oss/venv/bin/dvc: bad interpreter: /Users/ennmouri/csm/mlops-sustainability/venv/bin/python3: no such file or directory
[?25l                                                                          ⠋ Checking graph
  0% Adding...|                                      |0/2 [00:00<?,     ?file/s]
!
  0% Checking cache in '/Users/ennmouri/csm/mlops-sustainability-oss/dvc-testing
                                                                                
!
  0%|          |Transferring                          0/? [00:00<?,     ?file/s]
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s]
                                                                                
!
  0%|          |Checking out data/train_package.pkl   0/? [00:00<?,    ?files/s]
  0%|          |Checking out data/train_package.pkl   0/1 [00:00<?,    ?files/s]
 50% Adding...|████▌    | data/train_package.pkl |1/2 [00:00<00:00,  1.46file/s]


In [72]:
!cd dvc-testing && git add data/.gitignore data/train_package.pkl.dvc data/test_package.pkl.dvc

In [73]:
!cd dvc-testing && git config --global user.email "ilias.ennmouri@ibm.com"
!cd dvc-testing && git config --global user.name "Ilias Ennmouri"

In [74]:
!cd dvc-testing && git commit -m "New train test subsets"

[main b49d9fc] New train test subsets
 2 files changed, 4 insertions(+), 4 deletions(-)


In [75]:
!cd dvc-testing && dvc push && git push

zsh:1: /Users/ennmouri/csm/mlops-sustainability-oss/venv/bin/dvc: bad interpreter: /Users/ennmouri/csm/mlops-sustainability/venv/bin/python3: no such file or directory
  0% Transferring|                                   |0/2 [00:00<?,     ?file/s]
!
  0%|          |/Users/ennmouri/csm/mlops-sustainab0.00/? [00:00<?,        ?B/s]
  0%|          |/Users/ennmouri/csm/mlops-susta0.00/79.6M [00:00<?,        ?B/s]
 50% Transferring|███████████████▌               |1/2 [00:17<00:17, 17.38s/file]
                                                                                
!
  0%|          |/Users/ennmouri/csm/mlops-sustainab0.00/? [00:00<?,        ?B/s]
  0%|          |/Users/ennmouri/csm/mlops-sustai0.00/319M [00:00<?,        ?B/s]
 16%|█▌        |/Users/ennmouri/csm/mlops-s50.0M/319M [00:09<00:49,    5.69MB/s]
 31%|███▏      |/Users/ennmouri/csm/mlops-su100M/319M [00:18<00:39,    5.75MB/s]
 47%|████▋     |/Users/ennmouri/csm/mlops-su150M/319M [00:27<00:30,    5.71MB/s]
 63%|██████▎   |/U

In [76]:
from dvc.api import DVCFileSystem

In [77]:
# Open the repository at revision `main` through DVC's filesystem interface.
fs = DVCFileSystem(GIT_REPOSITORY, rev="main")

In [78]:
# List everything found under the repository root, restricted via dvc_only=True;
# detail=False presumably yields plain path strings rather than info dicts — verify.
dvc_tracked = fs.find("/", detail=False, dvc_only=True)

In [79]:
# The membership test already evaluates to a bool; no conditional expression is needed.
training_tracked = train_target in dvc_tracked
training_tracked

True

In [80]:
# The membership test already evaluates to a bool; no conditional expression is needed.
test_tracked = test_target in dvc_tracked
test_tracked

True

In [42]:
# Results to report: whether each package appears in the DVC listing, plus the
# repository path of each package.
validation_params = {
    'training_package_tracked': training_tracked,
    'test_package_tracked': test_tracked,
    'train_package_dvc_location': train_target,
    'test_package_dvc_location': test_target,
}

In [None]:
# Build a Watson Studio Pipelines client from the IBM Cloud API key and store the
# validation results through it.
# NOTE(review): CLOUD_API_KEY is None when the environment variable is unset — confirm
# how from_apikey behaves in that case.
pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(validation_params)