In [None]:
# Report the NVIDIA driver/CUDA versions and the GPU attached to this runtime.
!nvidia-smi

Fri Feb 19 19:49:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Prepare the packages

## Install RAPIDS

In [None]:
# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 190 (delta 8), reused 0 (delta 0), pack-reused 171[K
Receiving objects: 100% (190/190), 58.54 KiB | 8.36 MiB/s, done.
Resolving deltas: 100% (70/70), done.
PLEASE READ
********************************************************************************************************
Changes:
1. IMPORTANT CHANGES: RAPIDS on Colab will be pegged to 0.14 Stable until further notice.  This version of RAPIDS, while works, is outdated.  We have alternative solutions, https://app.blazingsql.com, to run the latest versions of RAPIDS
2. Default stable version is now 0.14.  Nightly will redirect to 0.14.
3. You can now declare your RAPIDSAI version as a CLI option and skip the user prompts (ex: '0.14' or '0.15', between 0.13 to 0.14, without the quotes): 
        "!bash rapidsai-csp-utils/colab/rapids-colab.sh <ver

## Install daal4py

In [None]:
!pip install daal4py

Collecting daal4py
  Downloading daal4py-2021.1-1-cp36-cp36m-manylinux1_x86_64.whl (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 18.1 MB/s 
Collecting daal
  Downloading daal-2021.1.2-py2.py3-none-manylinux1_x86_64.whl (248.2 MB)
[K     |████████████████████████████████| 248.2 MB 11 kB/s 
[?25hCollecting dpcpp-cpp-rt
  Downloading dpcpp_cpp_rt-2021.1.2-py2.py3-none-manylinux1_x86_64.whl (155.8 MB)
[K     |████████████████████████████████| 155.8 MB 73 kB/s 
[?25hCollecting tbb==2021.*
  Downloading tbb-2021.1.1-py2.py3-none-manylinux1_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 71.4 MB/s 
[?25hCollecting common-cmplr-lib-rt==2021.*
  Downloading common_cmplr_lib_rt-2021.1.2-py2.py3-none-manylinux1_x86_64.whl (30.6 MB)
[K     |████████████████████████████████| 30.6 MB 76 kB/s 
[?25hCollecting intel-openmp==2021.*
  Downloading intel_openmp-2021.1.2-py2.py3-none-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB

# Random Forest Classifier

In [None]:
import cudf
import numpy as np
import pandas as pd
import pickle

from cuml.ensemble import RandomForestClassifier as curfc
from cuml.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier as skrfc
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

## Define Parameters

In [None]:
# The speedup obtained by using cuML's Random Forest implementation
# becomes much higher when using larger datasets. Uncomment and use the larger
# n_samples value provided below to see the difference in the time required to run
# Scikit-learn's vs cuML's implementation with a large dataset.

# n_samples = 2**17  # larger dataset
# n_samples = 2**13  # smaller dataset
n_samples = 2**14
n_features = 399  # passed as n_features to make_classification
n_info = 300  # passed as n_informative to make_classification
data_type = np.float32  # dtype the generated feature matrix is cast to

## Generate Data

### Host

In [None]:
%%time
X,y = make_classification(n_samples=n_samples,
                          n_features=n_features,
                          n_informative=n_info,
                          random_state=123, n_classes=2)

X = pd.DataFrame(X.astype(data_type))
# cuML Random Forest Classifier requires the labels to be integers
y = pd.Series(y.astype(np.int32))

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state=0)

CPU times: user 638 ms, sys: 148 ms, total: 786 ms
Wall time: 547 ms


### GPU

In [None]:
%%time
# Convert the pandas training and test features into cuDF DataFrames.
X_cudf_train = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)

# Build the training labels from the raw values of y_train, so the pandas index
# is not carried over. y_test is not converted in this cell.
y_cudf_train = cudf.Series(y_train.values)

CPU times: user 370 ms, sys: 10 ms, total: 380 ms
Wall time: 381 ms


## Scikit-learn Model

In [None]:
%%time
sk_model = skrfc(n_estimators=40,
                 max_depth=16,
                 max_features=1.0,
                 random_state=10)

sk_model.fit(X_train, y_train)

CPU times: user 2min 33s, sys: 37.9 ms, total: 2min 33s
Wall time: 2min 33s


In [None]:
%%time
sk_predict = sk_model.predict(X_test)
# accuracy_score is the cuml.metrics function imported at the top of the
# notebook, not scikit-learn's; this cell's timing includes that call.
sk_acc = accuracy_score(y_test, sk_predict)

CPU times: user 38.5 ms, sys: 0 ns, total: 38.5 ms
Wall time: 40.1 ms


## cuML Model

In [None]:
%%time
cuml_model = curfc(n_estimators=40,
                   max_depth=16,
                   max_features=1.0,
                   seed=10)

cuml_model.fit(X_cudf_train, y_cudf_train)

  after removing the cwd from sys.path.


CPU times: user 22.3 s, sys: 1.25 s, total: 23.6 s
Wall time: 12.2 s


In [None]:
%%time
# Predict on the cuDF test features with the trained cuML model.
fil_preds_orig = cuml_model.predict(X_cudf_test)

# y_test is still a pandas Series; it is converted to a NumPy array for scoring.
fil_acc_orig = accuracy_score(y_test.to_numpy(), fil_preds_orig)

CPU times: user 142 ms, sys: 3 ms, total: 145 ms
Wall time: 146 ms


### Pickle the cuML random forest classification model

In [None]:
filename = 'cuml_random_forest_model.sav'
# save the trained cuml model into a file; the context manager closes (and
# flushes) the file before it is read back below.
with open(filename, 'wb') as model_file:
    pickle.dump(cuml_model, model_file)
# delete the previous model to ensure that there is no leakage of pointers.
# this is not strictly necessary but just included here for demo purposes.
# NOTE: after this, re-running this cell needs the cuML fit cell to be run again.
del cuml_model
# load the previously saved cuml model from a file
# NOTE: pickle.load can execute arbitrary code; only load files you wrote
# yourself, as is the case here.
with open(filename, 'rb') as model_file:
    pickled_cuml_model = pickle.load(model_file)

In [None]:
%%time
# Repeat the prediction with the model restored from the pickle file.
pred_after_pickling = pickled_cuml_model.predict(X_cudf_test)

# Scored against the same y_test as the pre-pickling predictions.
fil_acc_after_pickling = accuracy_score(y_test.to_numpy(), pred_after_pickling)

CPU times: user 131 ms, sys: 4 ms, total: 135 ms
Wall time: 136 ms


## daal4py model

In [None]:
# Enable the daal4py (Intel oneDAL) solvers for scikit-learn.
from daal4py.sklearn import patch_sklearn
patch_sklearn()

# Import the classifier again, after patching, under a separate alias.
# NOTE(review): presumably `skrfc`, imported before the patch, still refers to
# the stock scikit-learn class while `dalrfc` is the patched one; re-running the
# top import cell after this one would likely rebind `skrfc` -- confirm.
from sklearn.ensemble import RandomForestClassifier as dalrfc

Intel(R) oneAPI Data Analytics Library solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html


### Intel CPU optimizations patching

In [None]:
%%time
dal_model = dalrfc(n_estimators=40,
                   max_depth=16,
                   max_features=1.0,
                   random_state=10)

dal_model.fit(X_train, y_train)

CPU times: user 1min 37s, sys: 33.7 ms, total: 1min 37s
Wall time: 1min 37s


In [None]:
%%time
dal_predict = dal_model.predict(X_test)
# accuracy_score is the cuml.metrics function imported at the top of the
# notebook; this cell's timing includes that call.
dal_acc = accuracy_score(y_test, dal_predict)

CPU times: user 24.4 ms, sys: 4 µs, total: 24.4 ms
Wall time: 26.1 ms


### Intel CPU/GPU optimizations patching

None of the required accelerators is available in the Google Colab session, so the performance of oneDAL with CPU/GPU patching cannot be measured here.

Required accelerators for oneDAL:


*   Intel Processor Graphics [GEN9](https://software.intel.com/content/dam/develop/external/us/en/documents/the-compute-architecture-of-intel-processor-graphics-gen9-v1d0-166010.pdf) (and higher)
*   FPGA Cards and FPGA Custom Platforms



## Compare Results

In [None]:
# The two numbers should match if pickling preserved the model.
print("CUML accuracy of the RF model before pickling:", fil_acc_orig)
print("CUML accuracy of the RF model after pickling:", fil_acc_after_pickling)

CUML accuracy of the RF model before pickling: 0.7888312339782715
CUML accuracy of the RF model after pickling: 0.7888312339782715


In [None]:
# Accuracy of each implementation on the same held-out split, side by side.
for label, acc in (
    ("SKL accuracy", sk_acc),
    ("CUML accuracy before pickling", fil_acc_orig),
    ("oneDAL accuracy", dal_acc),
):
    print("%s: %s" % (label, acc))

SKL accuracy: 0.8104974031448364
CUML accuracy before pickling: 0.7888312339782715
oneDAL accuracy: 0.7958498597145081
