# Training the SKLearn model

- Clean data
    - Drop columns not required for training
    - Drop rows with null values where it makes sense
    (river discharge may be NaN where there is no river; it makes sense to keep these rows so the model can learn where rivers are)
- Think about whether or not to have separate notebooks for new data retrievals and prep
- Version Control the data
- Train test splitting
- Version control again??

In [None]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
!pip install ibm-cos-sdk ibm_watson_studio_pipelines 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

In [52]:
from ibm_watson_studio_pipelines import WSPipelines
from ibm_watson_machine_learning import APIClient
import ibm_boto3

from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import numpy as np
import pandas as pd

import pickle
import dvc.api
import io

import logging
import os, types
import warnings

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas/sklearn warnings raised by the cells below.
warnings.filterwarnings("ignore")

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution, make sure to set the environment variables that the Pipeline environment would otherwise have passed to the notebook.
Refer to ```credentials.py```.

In [2]:
# Local runs: put your credentials in the local credentials module; it exports them
# as environment variables for the cells below.
# When the module is absent (presumably the case inside a Watson Studio Pipeline run,
# where the variables are passed in by the pipeline - TODO confirm), the cell is a no-op
# instead of failing with ImportError.
try:
    from credentials2 import set_env_variables_for_credentials
except ImportError:
    print("credentials2 not found - relying on environment variables that are already set.")
else:
    set_env_variables_for_credentials()

In [3]:
# Settings handed to the notebook through environment variables.
# Each lookup yields None when the variable is not set.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY")
GIT_REPOSITORY = os.environ.get("GIT_REPOSITORY")
train_package_dvc_location = os.environ.get("train_package_dvc_location")
test_package_dvc_location = os.environ.get("test_package_dvc_location")

In [4]:
# For testing: fall back to the default DVC-tracked paths only when no location was
# supplied through the environment. Assigning the literals unconditionally would
# silently discard the values read from the environment.
train_package_dvc_location = os.getenv("train_package_dvc_location") or "data/train_package.pkl"
test_package_dvc_location = os.getenv("test_package_dvc_location") or "data/test_package.pkl"

### 1. Pre-Training: DVC Pull and Deserialize Training Data Package

In [5]:
# Git repository that holds the DVC tracking files, passed as `repo` to dvc.api.read below.
# TODO: Make pipeline param
repo = GIT_REPOSITORY

In [12]:
# Retrieve the dataset via its tracking information in git. The repository itself
# contains the remote storage info and credentials.
# NOTE: unpickling executes code embedded in the payload - only read from a trusted repository.
train_package = pickle.loads(
    dvc.api.read(train_package_dvc_location, repo=repo, mode="rb")
)

In [7]:
# Unpack the feature frame and the target from the deserialized training package.
X_train, y_train = train_package['X_train'], train_package['y_train']

In [8]:
# E.g. col 'step' has only a single unique value. Its existence has no effect on training
# and is solely a waste of resources. Therefore we drop all cols with that characteristic.
#
# The list is derived from the untouched frame in train_package, so re-running this cell
# rebuilds the same `dropped_cols` (the test cell reuses it) instead of resetting it to []
# once the columns are already gone. All columns are removed in a single drop rather than
# copying the frame once per dropped column.
X_train_raw = train_package['X_train']
dropped_cols = [key for key in X_train_raw.keys() if len(X_train_raw[key].unique()) < 2]
for key in dropped_cols:
    print(f"col '{key}' dropped because it bears no more than one unique value.")
X_train = X_train_raw.drop(dropped_cols, axis=1)

col 'step' dropped because it bears no more than one unique value.
col 'surface' dropped because it bears no more than one unique value.


In [9]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so that repeated runs build the same forest and report the same metrics.
RANDOM_STATE = 42

# Random forest regressor trained with 3 parallel jobs; verbose=True logs progress.
model = RandomForestRegressor(n_jobs=3, verbose=True, random_state=RANDOM_STATE)

# Coerce every feature column to a numeric dtype; values that cannot be converted become NaN.
X_train = X_train.apply(pd.to_numeric, errors="coerce")

In [11]:
# Train on the last 100000 rows only, handed over as plain NumPy arrays.
train_features = X_train.tail(100000).to_numpy()
train_targets = y_train.tail(100000).to_numpy()
model.fit(train_features, train_targets)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.4s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:   18.8s finished


In [None]:
# Retrieve the test package for brief testing (read through DVC, then unpickled).
# NOTE: unpickling executes code embedded in the payload - only read from a trusted repository.
test_package = pickle.loads(
    dvc.api.read(test_package_dvc_location, repo=repo, mode="rb")
)

In [15]:
# Make predictions on the testing data
X_test = test_package['X_test']

# Drop columns that were dropped in X_train earlier
X_test = X_test.drop(dropped_cols, axis=1)

# Select the features in exactly the training column order. The model was fitted on a
# bare array, so columns are matched by position only; a missing column raises KeyError
# here rather than producing silently misaligned predictions.
X_test = X_test[X_train.columns]

# Convert to ensure numeric data (avoid e.g. Timestamp() data type)
X_test = X_test.apply(pd.to_numeric, errors="coerce")

# Predict on a NumPy array, matching the input type used for model.fit
y_pred = model.predict(X_test.to_numpy())

col 'step' dropped because it bears no more than one unique value.
col 'surface' dropped because it bears no more than one unique value.


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    5.4s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:   12.1s finished


In [24]:
# Known target values belonging to the test features.
y_test = test_package['y_test']

# Side-by-side view of the model's prediction and the known value for each test row.
validation_df = pd.DataFrame(dict(y_pred=y_pred, y_validate=y_test))
validation_df

1370311

In [44]:
# Misc testing
# Select the predictions whose absolute error is more than 1% but at most 25% of the
# predicted value.
abs_error = (validation_df['y_pred'] - validation_df['y_validate']).abs()
within_25_pct = abs_error <= 0.25 * validation_df['y_pred']
beyond_1_pct = abs_error > 0.01 * validation_df['y_pred']

# Both masks are built on validation_df and applied to validation_df, so no implicit
# re-alignment of a boolean mask against a differently indexed frame is needed.
filtered_df = validation_df[within_25_pct]
filtered_df2 = validation_df[within_25_pct & beyond_1_pct]

# Show the filtered DataFrame
filtered_df2

Unnamed: 0,y_pred,y_validate
803411,19.224453,21.562500
2116340,4.445625,3.406250
1286202,1.498984,1.515625
601779,6.569219,7.468750
241730,3.078672,3.703125
...,...,...
3809291,2.818437,2.468750
3504059,8.433516,6.343750
2950364,3.147344,2.812500
5589021,1.931250,2.125000


### Check a few metrics

You may want to set a threshold for some metrics in the Watson Studio Pipeline. If so, make sure to pass the value (you want to set a threshold for) with the training_params down below.

In [51]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Mean squared error between the known test targets and the predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Mean Squared Error (MSE):', mse)

# Mean absolute error (MAE)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('Mean Absolute Error (MAE):', mae)

# R-squared score (coefficient of determination); reported to the pipeline further below
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R-squared Score:', r2)


Mean Squared Error (MSE): 163751.17759774564
Mean Absolute Error (MAE): 64.21870658695357
R-squared Score: 0.2732707390557916


### Store Model in the Project

In [53]:
# File name of the pickled model; also exported as an environment variable.
MODEL_FILENAME = "regression_model.pkl"
os.environ["MODEL_FILENAME"] = MODEL_FILENAME

# Serialize the fitted model into the working directory.
with open(MODEL_FILENAME, mode='wb') as model_file:
    pickle.dump(model, model_file)

In [54]:
# Credentials for the Watson Machine Learning API client: service endpoint plus API key.
WML_CREDENTIALS = dict(
    url="https://us-south.ml.cloud.ibm.com",
    apikey=CLOUD_API_KEY,
)

In [55]:
# Create the Watson Machine Learning API client from the credentials dict (URL + API key).
# NOTE(review): wml_client is not used by any of the cells visible below.
wml_client = APIClient(WML_CREDENTIALS)

In [56]:
# TODO: Save and Log Models in AI Factsheets

### Track Model with DVC

In [67]:
# Sanity check: print the model file name as seen by the shell command.
!echo $MODEL_FILENAME

regression_model.pkl


In [None]:
# Clone the repository referenced by GIT_REPOSITORY into the working directory.
# The following cells assume the clone ends up in a directory named dvc-testing.
!git clone $GIT_REPOSITORY

In [59]:
!cd dvc-testing && mkdir model

In [60]:
# Move the pickled model from the working directory into the clone's model/ directory.
!mv $MODEL_FILENAME dvc-testing/model/

In [69]:
# Put the model file under DVC tracking (run from inside the clone).
# The git add cell further below expects a matching .dvc file in model/ afterwards.
!cd dvc-testing && dvc add model/$MODEL_FILENAME

zsh:1: /Users/ennmouri/csm/mlops-sustainability-oss/venv/bin/dvc: bad interpreter: /Users/ennmouri/csm/mlops-sustainability/venv/bin/python3: no such file or directory
[?25l                                                                          ⠋ Checking graph
Adding...                                                                       
!
  0% Checking cache in '/Users/ennmouri/csm/mlops-sustainability-oss/dvc-testing
                                                                                
!
  0%|          |Transferring                          0/? [00:00<?,     ?file/s]
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s]
                                                                                
!
  0%|          |Checking out model/regression_model.pk0/? [00:00<?,    ?files/s]
  0%|          |Checking out model/regression_model.pk0/1 [00:00<?,    ?files/s]
100% Adding...|████████████████████████████████████████|1/1 [00:01,  1.66s/file]


In [70]:
!cd dvc-testing && git add model/$MODEL_FILENAME.dvc

fatal: pathspec 'model/.dvc' did not match any files


In [None]:
# Results handed back to the pipeline: completion flag, R-squared score and model file name.
training_params = {
    'training_completed': True,
    'r2_score': r2,
    'model_filename': MODEL_FILENAME,
}

In [None]:
# Create the Watson Studio Pipelines client from the API key and store the
# training results dict through it.
pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(training_params)