In [9]:
import cudf 
from sklearn import datasets
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.datasets import make_moons

from cuml.linear_model import (LogisticRegression as LogisticRegression_GPU,
                               LinearRegression as LinearRegression_gpu,
                              Ridge as Ridge_gpu)
from cuml.svm import SVC as SVC_gpu
from cuml.ensemble import RandomForestClassifier as RandomForestClassifier_gpu
from cuml.neighbors import (KNeighborsClassifier as KNeighborsClassifier_gpu,
                            KNeighborsRegressor as KNeighborsRegressor_gpu)

from time import time
from timeit import Timer, timeit

import cutecharts.charts as ctc 

import cuml
print(cuml.__version__)

0.15.0


# Classification

In [10]:
X, y  = datasets.make_classification(n_samples=40000)

In [11]:
X = X.astype(np.float32)
y = y.astype(np.float32)

In [12]:
def train_data(model, X=X, y=y):
    clf = model
    clf.fit(X, y)

In [19]:
def plot(sklearn_time, cuml_time):

    chart = ctc.Bar('Sklearn vs cuml')
    chart.set_options(
        labels=['sklearn', 'cuml'],
        x_label='library',
        y_label='time (s)',
        )

    chart.add_series('time', data=[round(sklearn_time.average,2), round(cuml_time.average,2)])
    return chart


## SVC

In [6]:
clf_svc = SVC(kernel='poly', degree=2, gamma='auto', C=1)
sklearn_time_svc = %timeit -o train_data(clf_svc)

34.8 s ± 481 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
clf_svc = SVC_gpu(kernel='poly', degree=2, gamma='auto', C=1)
cuml_time_svc = %timeit -o train_data(clf_svc)

2.83 s ± 9.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
print(f"""Average time of sklearn's {clf_svc.__class__.__name__}""", sklearn_time_svc.average, 's')
print(f"""Average time of cuml's {clf_svc.__class__.__name__}""", cuml_time_svc.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_svc.average/cuml_time_svc.average)

Average time of sklearn's SVC 34.81425743485712 s
Average time of cuml's SVC 2.8349827045714164 s
Ratio between sklearn and cuml is 12.280236270478492


In [9]:
plot(sklearn_time_svc, cuml_time_svc).render_notebook()

## Random Forest Classifier

In [16]:
clf_rf = RandomForestClassifier(max_features=1.0,
#                    n_bins=8,
                   n_estimators=40)
sklearn_time_rf = %timeit -o train_data(clf_rf)

29.2 s ± 247 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
clf_rf = RandomForestClassifier_gpu(max_features=1.0,
#                    n_bins=8,
                   n_estimators=40)
cuml_time_rf = %timeit -o train_data(clf_rf)

316 ms ± 22.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
print(f"""Average time of sklearn's {clf_rf.__class__.__name__}""", sklearn_time_rf.average, 's')
print(f"""Average time of cuml's {clf_rf.__class__.__name__}""", cuml_time_rf.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_rf.average/cuml_time_rf.average)

Average time of sklearn's RandomForestClassifier 29.213289268571412 s
Average time of cuml's RandomForestClassifier 0.31644228657140566 s
Ratio between sklearn and cuml is 92.31790600773387


In [20]:
plot(sklearn_time_rf, cuml_time_rf).render_notebook()

## Nearest Neighbors Classifier

In [21]:
clf_nn = KNeighborsClassifier(n_neighbors=10)
sklearn_time_nn = %timeit -o train_data(clf_nn)

76.2 ms ± 1.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
clf_nn = KNeighborsClassifier_gpu(n_neighbors=10)
cuml_time_nn = %timeit -o train_data(clf_nn)

2.07 ms ± 162 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [23]:
print(f"""Average time of sklearn's {clf_nn.__class__.__name__}""", sklearn_time_nn.average, 's')
print(f"""Average time of cuml's {clf_nn.__class__.__name__}""", cuml_time_nn.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_nn.average/cuml_time_nn.average)

Average time of sklearn's KNeighborsClassifier 0.0761912703714253 s
Average time of cuml's KNeighborsClassifier 0.0020676839028562558 s
Ratio between sklearn and cuml is 36.8486064364946


In [24]:
plot(sklearn_time_nn, cuml_time_nn).render_notebook()

# Make Regression

In [25]:
X, y  = datasets.make_regression(n_samples=40000)

## Linear Regression

In [26]:
lr = LinearRegression(fit_intercept = True, normalize = False,)
sklearn_time_lr = %timeit -o train_data(lr)

11.2 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
lr = LinearRegression_gpu(fit_intercept = True, normalize = False,)
cuml_time_lr = %timeit -o train_data(lr)

2.46 ms ± 505 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
print(f"""Average time of sklearn's {lr.__class__.__name__}""", sklearn_time_lr.average, 's')
print(f"""Average time of cuml's {lr.__class__.__name__}""", cuml_time_lr.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_lr.average/cuml_time_lr.average)

Average time of sklearn's LinearRegression 0.01119720891857209 s
Average time of cuml's LinearRegression 0.0024577860001175266 s
Ratio between sklearn and cuml is 4.555811172346437


In [29]:
plot(sklearn_time_lr, cuml_time_lr).render_notebook()

## Ridge Regresion

In [30]:
alpha = np.array([1e-5])

ridge = Ridge(alpha = alpha, fit_intercept = True, normalize = False)
sklearn_time_ridge = %timeit -o train_data(ridge)

29.1 ms ± 6.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
ridge = Ridge_gpu(alpha = alpha, fit_intercept = True, normalize = False)
cuml_time_ridge = %timeit -o train_data(ridge)

3.36 ms ± 233 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [32]:
print(f"""Average time of sklearn's {ridge.__class__.__name__}""", sklearn_time_ridge.average, 's')
print(f"""Average time of cuml's {ridge.__class__.__name__}""", cuml_time_ridge.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_ridge.average/cuml_time_ridge.average)

Average time of sklearn's Ridge 0.029140467057140993 s
Average time of cuml's Ridge 0.0033603680528572766 s
Ratio between sklearn and cuml is 8.671808146837737


In [33]:
plot(sklearn_time_ridge, cuml_time_ridge).render_notebook()

## Nearest Neighbors Regression


In [34]:
X, y = datasets.make_blobs(n_samples=4000, centers=5,
                  n_features=10)

knn = KNeighborsRegressor(n_neighbors=10)

sklearn_time_knn = %timeit -o train_data(knn)

79.1 ms ± 3.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [35]:
knn = KNeighborsRegressor_gpu(n_neighbors=10)

cuml_time_knn = %timeit -o train_data(knn)

1.61 ms ± 141 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [36]:
plot(sklearn_time_knn, cuml_time_knn).render_notebook()