# Training a RandomForestRegressor

In [None]:
# Install required packages.
# TODO: Create IBM Cloud Software Configuration for those
%pip install ibm-cos-sdk xgboost ibm_watson_studio_pipelines 'dvc[s3]' # dvc[all] alternatively, however, COS is covered by S3

In [2]:
from ibm_watson_studio_pipelines import WSPipelines
from ibm_watson_machine_learning import APIClient
import ibm_boto3
from botocore.client import Config

import pickle
import dvc.api
import io

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from dataclasses import dataclass

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

import logging
import os, types
import warnings

warnings.filterwarnings("ignore")

### Setup IBM Cloud and COS Credentials

**Note**: If you are running this notebook outside of a Watson Studio Pipeline execution. Make sure to set the environment variables that the Pipeline environment would have passed to the notebook.
Refer to ```credentials.py```.

In [3]:
# Uncomment this cell and put your credentials in credentials.py to run locally.
from credentials2 import set_env_variables_for_credentials
set_env_variables_for_credentials()

ImportError: attempted relative import with no known parent package

In [3]:
CLOUD_API_KEY = os.getenv("CLOUD_API_KEY")
GIT_REPOSITORY = os.getenv("GIT_REPOSITORY")
train_package_dvc_location = os.getenv("train_package_dvc_location") 
test_package_dvc_location = os.getenv("test_package_dvc_location")

In [4]:
# For testing
train_package_dvc_location = "data/train_package.pkl"
test_package_dvc_location = "data/test_package.pkl"

### 1. Pre-Training: DVC Pull and Deserialize Training Data Package

In [5]:
# TODO: Make pipeline param
repo = \
    GIT_REPOSITORY

In [6]:
# Retrieve dataset from tracking information in git. The repository itself contains the remote storage info and credentials.
train_package = pickle.load(io.BytesIO(dvc.api.read(train_package_dvc_location,repo=repo, mode="rb")))


In [7]:
X_train = train_package['X_train']
y_train = train_package['y_train'] 

### Train the RF Regressor

In [21]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

In [16]:
X_train.dtypes

time          datetime64[ns]
latitude             float64
longitude            float64
stl1                 float64
tp                   float64
swvl1                float64
valid_time    datetime64[ns]
dtype: object

In [18]:
import datetime as dt

X_train['time'] = X_train['time'].map(dt.datetime.toordinal)
X_train['valid_time'] = X_train['valid_time'].map(dt.datetime.toordinal)

In [23]:
test_package = pickle.load(io.BytesIO(dvc.api.read(test_package_dvc_location,repo=repo, mode="rb")))
X_test = test_package['X_test']
y_test = test_package['y_test'] 

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
X_test = X_test.apply(pd.to_numeric, errors="coerce")
#X_test['time'] = X_test['time'].map(dt.datetime.toordinal)
#X_test['valid_time'] = X_test['valid_time'].map(dt.datetime.toordinal)

# Predicting the target values of the test set
y_pred = rf.predict(X_test)

# RMSE (Root Mean Square Error)
rmse = float(format(np.sqrt(mean_squared_error(y_test, y_pred)), '.3f'))
print("\nRMSE: ", rmse)


RMSE:  404.656


In [30]:
with open("rf100.pkl", 'wb') as f:
    pickle.dump(rf, f)

In [31]:
import joblib
from sklearn.ensemble import RandomForestClassifier


# save
joblib.dump(rf, "rf100.joblib")

# load
#loaded_rf = joblib.load("my_random_forest.joblib")

['rf100.joblib']

In [22]:
# Retrieve test package for brief testing
test_package = pickle.load(io.BytesIO(dvc.api.read(test_package_dvc_location,repo=repo, mode="rb")))

In [23]:
# Make predictions on the testing data
X_test = test_package['X_test']

# NOTE: Step no longer necessary
# # Drop columns that were dropped in X_train earlier
# X_test = X_test.drop(dropped_cols, axis=1)

# Convert to ensure numeric data (avoid e.g. Timestamp() data type)
X_test = X_test.apply(pd.to_numeric, errors="coerce")

y_pred = model.predict(X_test)

### Check a few metrics

You may want to set a threshold for some metrics in the Watson Studio Pipeline. If so, make sure to pass the value (you want to set a threshold for) with the training_params down below.

In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate the mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error (MSE):', mse)

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)

# Calculate the R-squared score (coefficient of determination)
r2 = r2_score(y_test, y_pred)

# Print the R-squared score
print('R-squared Score:', r2)

Mean Squared Error (MSE): 303412.8
Mean Absolute Error (MAE): 70.44455
R-squared Score: 0.0155279162979014


### Serialize Regressor

In [27]:
MODEL_FILENAME = "xgbr.pkl"

os.environ["MODEL_FILENAME"] = MODEL_FILENAME

with open(MODEL_FILENAME, 'wb') as f:
    pickle.dump(model, f)

### Track Model with DVC

In [None]:
!echo $MODEL_FILENAME

In [None]:
!git clone $GIT_REPOSITORY

In [None]:
!cd dvc-testing && mkdir model

In [None]:
!mv $MODEL_FILENAME dvc-testing/model/

In [None]:
!cd dvc-testing && dvc add model/$MODEL_FILENAME

In [None]:
!cd dvc-testing && git add model/$MODEL_FILENAME.dvc

In [None]:
!cd dvc-testing && git commit -m "New regression model" && git push

In [None]:
!cd dvc-testing && dvc push

In [None]:
training_params = {}
training_params['training_completed'] = True
training_params['r2_score'] = r2
training_params['model_filename'] = MODEL_FILENAME

In [None]:
pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(training_params)