In [3]:
import numpy as np 
import pandas as pd 

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.datasets import make_moons

from cuml.linear_model import (LogisticRegression as LogisticRegression_GPU,
                               LinearRegression as LinearRegression_gpu,
                              Ridge as Ridge_gpu)
from cuml.svm import SVC as SVC_gpu
from cuml.ensemble import RandomForestClassifier as RandomForestClassifier_gpu
from cuml.neighbors import (KNeighborsClassifier as KNeighborsClassifier_gpu,
                            KNeighborsRegressor as KNeighborsRegressor_gpu)

from time import time
from timeit import Timer, timeit

import cutecharts.charts as ctc 

import warnings
warnings.filterwarnings("ignore")

import cuml
print(cuml.__version__)

0.15.0


# Classification

In [4]:
models = []
results_sklearn = []
results_cuml = []

In [5]:
X, y  = datasets.make_classification(n_samples=40000)

In [6]:
X = X.astype(np.float32)
y = y.astype(np.float32)

In [7]:
def train_data(model, X=X, y=y):
    clf = model
    clf.fit(X, y)

In [8]:
def plot(sklearn_time, cuml_time):

    chart = ctc.Bar('Sklearn vs cuml')
    chart.set_options(
        labels=['sklearn', 'cuml'],
        x_label='library',
        y_label='time (s)',
        )

    chart.add_series('time', data=[round(sklearn_time.average,2), round(cuml_time.average,2)])
    return chart
#     chart.render_notebook()


## SVC

In [9]:
clf_svc = SVC(kernel='poly', degree=2, gamma='auto', C=1)
sklearn_time_svc = %timeit -o train_data(clf_svc)

35.8 s ± 200 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
clf_svc = SVC_gpu(kernel='poly', degree=2, gamma='auto', C=1)
cuml_time_svc = %timeit -o train_data(clf_svc)

2 s ± 6.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
print(f"""Average time of sklearn's {clf_svc.__class__.__name__}""", sklearn_time_svc.average, 's')
print(f"""Average time of cuml's {clf_svc.__class__.__name__}""", cuml_time_svc.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_svc.average/cuml_time_svc.average)

Average time of sklearn's SVC 35.791008955999914 s
Average time of cuml's SVC 1.9953700327142931 s
Ratio between sklearn and cuml is 17.93702840535976


In [83]:
19.611496431714304/1.9953700327142931

9.828501034986914

In [14]:
plot(sklearn_time_svc, cuml_time_svc).render_notebook()

In [45]:
models.append(clf_svc.__class__.__name__)
results_sklearn.append(round(sklearn_time_svc.average,3))
results_cuml.append(round(cuml_time_svc.average,3))

## Random Forest Classifier

In [16]:
clf_rf = RandomForestClassifier(max_features=1.0,
#                    n_bins=8,
                   n_estimators=40)
sklearn_time_rf = %timeit -o train_data(clf_rf)

24 s ± 88.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
clf_rf = RandomForestClassifier_gpu(max_features=1.0,
#                    n_bins=8,
                   n_estimators=40)
cuml_time_rf = %timeit -o train_data(clf_rf)

151 ms ± 8.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
print(f"""Average time of sklearn's {clf_rf.__class__.__name__}""", sklearn_time_rf.average, 's')
print(f"""Average time of cuml's {clf_rf.__class__.__name__}""", cuml_time_rf.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_rf.average/cuml_time_rf.average)

Average time of sklearn's RandomForestClassifier 24.006061030143037 s
Average time of cuml's RandomForestClassifier 0.15141178591425808 s
Ratio between sklearn and cuml is 158.54816641379068


In [19]:
plot(sklearn_time_rf, cuml_time_rf).render_notebook()

In [46]:
models.append(clf_rf.__class__.__name__)
results_sklearn.append(round(sklearn_time_rf.average,3))
results_cuml.append(round(cuml_time_rf.average,3))

## Nearest Neighbors Classifier

In [20]:
clf_nn = KNeighborsClassifier(n_neighbors=10)
sklearn_time_nn = %timeit -o train_data(clf_nn)

75.1 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
clf_nn = KNeighborsClassifier_gpu(n_neighbors=10)
cuml_time_nn = %timeit -o train_data(clf_nn)

1.51 ms ± 8.89 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
print(f"""Average time of sklearn's {clf_nn.__class__.__name__}""", sklearn_time_nn.average, 's')
print(f"""Average time of cuml's {clf_nn.__class__.__name__}""", cuml_time_nn.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_nn.average/cuml_time_nn.average)

Average time of sklearn's KNeighborsClassifier 0.07511190322854547 s
Average time of cuml's KNeighborsClassifier 0.0015137992111426033 s
Ratio between sklearn and cuml is 49.618141346401956


In [23]:
plot(sklearn_time_nn, cuml_time_nn).render_notebook()

In [47]:
models.append(clf_nn.__class__.__name__)
results_sklearn.append(round(sklearn_time_nn.average,3))
results_cuml.append(round(cuml_time_nn.average,3))

# Make Regression

In [25]:
X, y  = datasets.make_regression(n_samples=40000)

## Linear Regression

In [26]:
lr = LinearRegression(fit_intercept = True, normalize = False,)
sklearn_time_lr = %timeit -o train_data(lr)

6.26 ms ± 46.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
lr = LinearRegression_gpu(fit_intercept = True, normalize = False,)
cuml_time_lr = %timeit -o train_data(lr)

1.62 ms ± 5.25 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [28]:
print(f"""Average time of sklearn's {lr.__class__.__name__}""", sklearn_time_lr.average, 's')
print(f"""Average time of cuml's {lr.__class__.__name__}""", cuml_time_lr.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_lr.average/cuml_time_lr.average)

Average time of sklearn's LinearRegression 0.006258117452855783 s
Average time of cuml's LinearRegression 0.001624685743714768 s
Ratio between sklearn and cuml is 3.8518941137175795


In [29]:
plot(sklearn_time_lr, cuml_time_lr).render_notebook()

In [48]:
models.append(lr.__class__.__name__)
results_sklearn.append(round(sklearn_time_lr.average,3))
results_cuml.append(round(cuml_time_lr.average,3))

## Ridge Regresion

In [9]:
alpha = np.array([1e-5])

ridge = Ridge(alpha = alpha, fit_intercept = True, normalize = False)
sklearn_time_ridge = %timeit -o train_data(ridge)

4.33 ms ± 466 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
ridge = Ridge_gpu(alpha = alpha, fit_intercept = True, normalize = False)
cuml_time_ridge = %timeit -o train_data(ridge)

1.68 ms ± 42.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [32]:
print(f"""Average time of sklearn's {ridge.__class__.__name__}""", sklearn_time_ridge.average, 's')
print(f"""Average time of cuml's {ridge.__class__.__name__}""", cuml_time_ridge.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_ridge.average/cuml_time_ridge.average)

Average time of sklearn's Ridge 0.0050736307028585404 s
Average time of cuml's Ridge 0.0016766798014292103 s
Ratio between sklearn and cuml is 3.02599858275489


In [33]:
plot(sklearn_time_ridge, cuml_time_ridge).render_notebook()

In [49]:
models.append(ridge.__class__.__name__)
results_sklearn.append(round(sklearn_time_ridge.average,3))
results_cuml.append(round(cuml_time_ridge.average,3))

## Nearest Neighbors Regression


In [34]:
X, y = datasets.make_blobs(n_samples=4000, centers=5,
                  n_features=10)

knn = KNeighborsRegressor(n_neighbors=10)

sklearn_time_knn = %timeit -o train_data(knn)

69.4 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [35]:
knn = KNeighborsRegressor_gpu(n_neighbors=10)

cuml_time_knn = %timeit -o train_data(knn)

965 µs ± 6.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [36]:
plot(sklearn_time_knn, cuml_time_knn).render_notebook()

In [50]:
models.append(knn.__class__.__name__)
results_sklearn.append(round(sklearn_time_knn.average,3))
results_cuml.append(round(cuml_time_knn.average,3))

# Comparison table

In [60]:
import datapane as dp 
import pickle5 as pickle 

In [76]:
comparison_table_more_gpu = pd.DataFrame({'sklearn(s)': results_sklearn,
             'cuml(s)': results_cuml},
            index=models)

In [78]:
comparison_table_more_gpu['sklearn/cuml'] = comparison_table_more_gpu.apply(lambda row: row['sklearn(s)']/ row['cuml(s)'], axis=1)

In [79]:
comparison_table_more_gpu

Unnamed: 0,sklearn(s),cuml(s),sklearn/cuml
SVC,35.791,1.995,17.940351
RandomForestClassifier,24.006,0.151,158.980132
KNeighborsClassifier,0.075,0.002,37.5
LinearRegression,0.006,0.002,3.0
Ridge,0.005,0.002,2.5
KNeighborsRegressor,0.069,0.001,69.0


In [80]:
comparison_table = pickle.load(open('comparison_table.pkl', 'rb'))

In [81]:
comparison_table

Unnamed: 0,sklearn(s),cuml(s),sklearn/cuml
SVC,50.243,23.689,2.120942
RandomForestClassifier,29.824,0.443,67.322799
KNeighborsClassifier,0.078,0.004,19.5
LinearRegression,0.005,0.006,0.833333
Ridge,0.021,0.006,3.5
KNeighborsRegressor,0.076,0.002,38.0


In [84]:
dp.Report(
f'''# Alienware M15 - GeForce 2060 and a graphics card's memory of 6.3 GB''',
dp.Table(comparison_table),
f'''# Dell Precision 7740 - Quadro RTX 5000 and a graphics card's memory of 17 GB''',
dp.Table(comparison_table_more_gpu)).publish(name='cuml_sklearn_comparison')

Publishing report and associated data - please wait..
Report successfully published at https://datapane.com/u/khuyentran1401/reports/cuml-sklearn-comparison/
