# Training the SKLearn model

- Clean data
    - Drop columns not required for training
    - Drop rows with null valus where it makes sense 
    (river discharge may be NaN where there is no river. It makes sense to keep these rows for the model to learn where rivers are)
- Think about whether or not to have separate notebooks for new data retrievals and prep
- Version Control the data
- Train test splitting
- Version control again??

In [None]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
%pip install ibm-cos-sdk xgboost ibm_watson_studio_pipelines 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

In [1]:
from ibm_watson_studio_pipelines import WSPipelines
from ibm_watson_machine_learning import APIClient
import ibm_boto3

from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import numpy as np
import pandas as pd

import pickle
import dvc.api
import io

import logging
import os, types
import warnings

warnings.filterwarnings("ignore")

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution. Make sure to set the environment variables that the Pipeline environment would have passed to the notebook.
Refer to ```credentials.py```.

In [2]:
# Uncomment this cell and put your credentials in credentials.py to run locally.
from credentials import set_env_variables_for_credentials
set_env_variables_for_credentials()

In [3]:
CLOUD_API_KEY = os.getenv("CLOUD_API_KEY")
GIT_REPOSITORY = os.getenv("GIT_REPOSITORY")
train_package_dvc_location = os.getenv("train_package_dvc_location") 
test_package_dvc_location = os.getenv("test_package_dvc_location")

In [4]:
# For testing
train_package_dvc_location = "data/train_package.pkl"
test_package_dvc_location = "data/test_package.pkl"

### 1. Pre-Training: DVC Pull and Deserialize Training Data Package

In [5]:
# TODO: Make pipeline param
repo = \
    GIT_REPOSITORY

In [6]:
# Retrieve dataset from tracking information in git. The repository itself contains the remote storage info and credentials.
train_package = pickle.load(io.BytesIO(dvc.api.read(train_package_dvc_location,repo=repo, mode="rb")))


In [7]:
X_train = train_package['X_train']
y_train = train_package['y_train'] 

In [11]:
X_train.head(20)

Unnamed: 0,time,latitude,longitude,stl1,tp,swvl1,valid_time
383418,2023-01-07,49.05,32.35,273.509262,0.001308243,0.381918,2023-01-08
1672526,2023-01-29,41.35,31.95,278.198369,0.01080334,0.407033,2023-01-30
4610241,2023-03-22,56.85,33.05,273.563887,0.0006858681,0.39603,2023-03-23
1444657,2023-01-25,38.45,29.85,273.363105,3.72529e-09,0.30604,2023-01-26
6471636,2023-04-24,54.65,33.35,279.434854,1.430511e-06,0.279207,2023-04-25
5984724,2023-04-16,69.35,27.75,271.998772,1.430511e-06,0.229226,2023-04-17
807335,2023-01-14,37.55,31.25,275.813488,0.004128022,0.386734,2023-01-15
3339121,2023-02-28,52.75,38.65,271.324608,2.496876e-05,0.397059,2023-03-01
4557906,2023-03-21,53.05,28.35,274.877838,1.430511e-06,0.37775,2023-03-22
4585070,2023-03-21,35.25,39.15,281.84175,0.01093923,0.341394,2023-03-22


In [12]:
y_train

383418      0.601562
1672526     0.718750
4610241     0.437500
1444657     0.171875
6471636     0.468750
             ...    
2123762    19.265625
3030822     0.718750
6790267     0.468750
5948242     0.937500
2292072     0.437500
Name: dis24, Length: 3976201, dtype: float32

In [20]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

### Train the RF Classifier

In [21]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

In [16]:
X_train.dtypes

time          datetime64[ns]
latitude             float64
longitude            float64
stl1                 float64
tp                   float64
swvl1                float64
valid_time    datetime64[ns]
dtype: object

In [18]:
import datetime as dt

X_train['time'] = X_train['time'].map(dt.datetime.toordinal)
X_train['valid_time'] = X_train['valid_time'].map(dt.datetime.toordinal)

In [23]:
test_package = pickle.load(io.BytesIO(dvc.api.read(test_package_dvc_location,repo=repo, mode="rb")))
X_test = test_package['X_test']
y_test = test_package['y_test'] 

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
X_test = X_test.apply(pd.to_numeric, errors="coerce")
#X_test['time'] = X_test['time'].map(dt.datetime.toordinal)
#X_test['valid_time'] = X_test['valid_time'].map(dt.datetime.toordinal)

# Predicting the target values of the test set
y_pred = rf.predict(X_test)

# RMSE (Root Mean Square Error)
rmse = float(format(np.sqrt(mean_squared_error(y_test, y_pred)), '.3f'))
print("\nRMSE: ", rmse)


RMSE:  404.656


In [30]:
with open("rf100.pkl", 'wb') as f:
    pickle.dump(rf, f)

In [31]:
import joblib
from sklearn.ensemble import RandomForestClassifier


# save
joblib.dump(rf, "rf100.joblib")

# load
#loaded_rf = joblib.load("my_random_forest.joblib")

['rf100.joblib']

In [20]:
import xgboost as xgb

# Define the hyperparameters for XGBRegressor
params = {
    'objective': 'reg:squarederror',  # Objective function for regression
    'learning_rate': 0.001,             # Learning rate
    'max_depth': 4,                   # Maximum depth of each tree
    'n_estimators': 500,              # Number of trees (boosting rounds)
    'subsample': 0.6,                 # Subsample ratio of the training instances
    'colsample_bytree': 0.6,          # Subsample ratio of columns when constructing each tree
    'gamma': 0.1,                     # Minimum loss reduction required to make a further partition on a leaf node
    'reg_alpha': 0.25,                 # L1 regularization term on weights
    'reg_lambda': 0.25,                # L2 regularization term on weights
    'random_state': 42                # Random seed for reproducibility
}

# Create an instance of XGBRegressor
model = xgb.XGBRegressor(**params)

X_train = X_train.apply(pd.to_numeric, errors="coerce")

model.fit(X_train.to_numpy(), y_train.to_numpy())

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.6, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.001, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=4, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...)

In [21]:
model.score(X_train.tail(2000000).to_numpy(), y_train.tail(2000000).to_numpy())

0.016321689875047785

In [22]:
# Retrieve test package for brief testing
test_package = pickle.load(io.BytesIO(dvc.api.read(test_package_dvc_location,repo=repo, mode="rb")))

In [23]:
# Make predictions on the testing data
X_test = test_package['X_test']

# NOTE: Step no longer necessary
# # Drop columns that were dropped in X_train earlier
# X_test = X_test.drop(dropped_cols, axis=1)

# Convert to ensure numeric data (avoid e.g. Timestamp() data type)
X_test = X_test.apply(pd.to_numeric, errors="coerce")

y_pred = model.predict(X_test)

In [24]:
y_test = test_package['y_test']

# In-line comparison of actual prediction versus known predictant 
validation_df = pd.DataFrame({'y_pred': y_pred, 'y_validate': y_test})
validation_df

Unnamed: 0,y_pred,y_validate
196490,10.540028,2.078125
1695950,6.751508,0.156250
3171536,38.793026,0.500000
3228297,50.555542,44.000000
3629990,10.778741,0.562500
...,...,...
704725,5.338154,0.625000
1001508,2.343091,0.015625
880876,11.156476,0.343750
6754564,27.448526,0.781250


In [25]:
# Misc testing
# See how many predictions are off by no more than 1-25% 
# Filter the DataFrame based on the condition
filtered_df = validation_df[abs(validation_df['y_pred'] - validation_df['y_validate']) <= 0.25 * validation_df['y_pred']]
filtered_df2 = filtered_df[abs(validation_df['y_pred'] - validation_df['y_validate']) > 0.01 * validation_df['y_pred']]

# Print the filtered DataFrame
filtered_df2

Unnamed: 0,y_pred,y_validate
3228297,50.555542,44.000000
198564,20.518646,24.382812
5833072,33.486599,38.625000
3860088,13.480683,11.687500
2877083,38.576653,28.984375
...,...,...
1944244,11.531141,8.953125
1558944,17.632755,17.140625
1292283,3.071134,2.875000
4230409,16.176872,12.437500


### Check a few metrics

You may want to set a threshold for some metrics in the Watson Studio Pipeline. If so, make sure to pass the value (you want to set a threshold for) with the training_params down below.

In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate the mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error (MSE):', mse)

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)

# Calculate the R-squared score (coefficient of determination)
r2 = r2_score(y_test, y_pred)

# Print the R-squared score
print('R-squared Score:', r2)

Mean Squared Error (MSE): 303412.8
Mean Absolute Error (MAE): 70.44455
R-squared Score: 0.0155279162979014


### Serialize Regressor

In [27]:
MODEL_FILENAME = "xgbr.pkl"

os.environ["MODEL_FILENAME"] = MODEL_FILENAME

with open(MODEL_FILENAME, 'wb') as f:
    pickle.dump(model, f)

### Track Model with DVC

In [None]:
!echo $MODEL_FILENAME

In [None]:
!git clone $GIT_REPOSITORY

In [None]:
!cd dvc-testing && mkdir model

In [None]:
!mv $MODEL_FILENAME dvc-testing/model/

In [None]:
!cd dvc-testing && dvc add model/$MODEL_FILENAME

In [None]:
!cd dvc-testing && git add model/$MODEL_FILENAME.dvc

In [None]:
!cd dvc-testing && git commit -m "New regression model" && git push

In [None]:
!cd dvc-testing && dvc push

In [None]:
training_params = {}
training_params['training_completed'] = True
training_params['r2_score'] = r2
training_params['model_filename'] = MODEL_FILENAME

In [None]:
pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(training_params)