In [106]:
import numpy as np 
import pandas as pd 

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.datasets import make_moons

from cuml.linear_model import (LogisticRegression as LogisticRegression_GPU,
                               LinearRegression as LinearRegression_gpu,
                              Ridge as Ridge_gpu)
from cuml.svm import SVC as SVC_gpu
from cuml.ensemble import RandomForestClassifier as RandomForestClassifier_gpu
from cuml.neighbors import (KNeighborsClassifier as KNeighborsClassifier_gpu,
                            KNeighborsRegressor as KNeighborsRegressor_gpu)

from time import time
from timeit import Timer, timeit

import cutecharts.charts as ctc 

import cuml
print(cuml.__version__)

0.15.0


# Classification

In [114]:
models = []
results_sklearn = []
results_cuml = []

In [2]:
X, y  = datasets.make_classification(n_samples=40000)

In [3]:
X = X.astype(np.float32)
y = y.astype(np.float32)

In [4]:
def train_data(model, X=X, y=y):
    clf = model
    clf.fit(X, y)

In [60]:
def plot(sklearn_time, cuml_time):
    import matplotlib.pyplot as plt 

    chart = ctc.Bar('Sklearn vs cuml')
    chart.set_options(
        labels=['sklearn', 'cuml'],
        x_label='library',
        y_label='time (s)',
        )

    chart.add_series('time', data=[round(sklearn_time.average,2), round(cuml_time.average,2)])
    return chart
#     chart.render_notebook()


## SVC

In [85]:
clf_svc = SVC(kernel='poly', degree=2, gamma='auto', C=1)
sklearn_time_svc = %timeit -o train_data(clf_svc)

50.2 s ± 1.86 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [84]:
clf_svc = SVC_gpu(kernel='poly', degree=2, gamma='auto', C=1)
cuml_time_svc = %timeit -o train_data(clf_svc)

23.7 s ± 5.36 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
print(f"""Average time of sklearn's {clf_svc.__class__.__name__}""", sklearn_time_svc.average, 's')
print(f"""Average time of cuml's {clf_svc.__class__.__name__}""", cuml_time_svc.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_svc.average/cuml_time_svc.average)

Average time of sklearn's SVC 48.56009825014287 s
Average time of cuml's SVC 19.611496431714304 s
Ratio between sklearn and cuml is 2.476103668030909


In [2]:
19.611496431714304/2.8349827045714164

6.917677628188249

In [61]:
plot(sklearn_time_svc, cuml_time_svc).render_notebook()

In [115]:
models.append(clf_svc.__class__.__name__)
results_sklearn.append(round(sklearn_time_svc.average,3))
results_cuml.append(round(cuml_time_svc.average,3))

## Random Forest Classifier

In [23]:
clf_rf = RandomForestClassifier(max_features=1.0,
#                    n_bins=8,
                   n_estimators=40)
sklearn_time_rf = %timeit -o train_data(clf_rf)

29.8 s ± 394 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [83]:
clf_rf = RandomForestClassifier_gpu(max_features=1.0,
#                    n_bins=8,
                   n_estimators=40)
cuml_time_rf = %timeit -o train_data(clf_rf)

443 ms ± 7.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
print(f"""Average time of sklearn's {clf_rf.__class__.__name__}""", sklearn_time_rf.average, 's')
print(f"""Average time of cuml's {clf_rf.__class__.__name__}""", cuml_time_rf.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_rf.average/cuml_time_rf.average)

Average time of sklearn's RandomForestClassifier 29.824075075857113 s
Average time of cuml's RandomForestClassifier 0.49404465585715635 s
Ratio between sklearn and cuml is 60.3671646323408


In [62]:
plot(sklearn_time_rf, cuml_time_rf).render_notebook()

In [None]:
0.31644228657140566

## Nearest Neighbors Classifier

In [13]:
clf_nn = KNeighborsClassifier(n_neighbors=10)
sklearn_time_nn = %timeit -o train_data(clf_nn)

78.4 ms ± 1.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
clf_nn = KNeighborsClassifier_gpu(n_neighbors=10)
cuml_time_nn = %timeit -o train_data(clf_nn)

4.25 ms ± 188 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
print(f"""Average time of sklearn's {clf_nn.__class__.__name__}""", sklearn_time_nn.average, 's')
print(f"""Average time of cuml's {clf_nn.__class__.__name__}""", cuml_time_nn.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_nn.average/cuml_time_nn.average)

Average time of sklearn's KNeighborsClassifier 0.07836367340000508 s
Average time of cuml's KNeighborsClassifier 0.004251259535714585 s
Ratio between sklearn and cuml is 18.43304854518441


In [64]:
plot(sklearn_time_nn, cuml_time_nn).render_notebook()

In [117]:
models.append(clf_nn.__class__.__name__)
results_sklearn.append(round(sklearn_time_nn.average,3))
results_cuml.append(round(cuml_time_nn.average,3))

# Make Regression

In [16]:
X, y  = datasets.make_regression(n_samples=40000)

## Linear Regression

In [17]:
lr = LinearRegression(fit_intercept = True, normalize = False,)
sklearn_time_lr = %timeit -o train_data(lr)

4.81 ms ± 10.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
lr = LinearRegression_gpu(fit_intercept = True, normalize = False,)
cuml_time_lr = %timeit -o train_data(lr)

5.92 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
print(f"""Average time of sklearn's {lr.__class__.__name__}""", sklearn_time_lr.average, 's')
print(f"""Average time of cuml's {lr.__class__.__name__}""", cuml_time_lr.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_lr.average/cuml_time_lr.average)

Average time of sklearn's LinearRegression 0.004809828324285457 s
Average time of cuml's LinearRegression 0.0059197622856897525 s
Ratio between sklearn and cuml is 0.8125036263555017


In [70]:
plot(sklearn_time_lr, cuml_time_lr).render_notebook()

In [1]:
0.0059197622856897525/0.0024577860001175266

2.4085751507278017

## Ridge Regresion

In [20]:
alpha = np.array([1e-5])

ridge = Ridge(alpha = alpha, fit_intercept = True, normalize = False)
sklearn_time_ridge = %timeit -o train_data(ridge)

21.1 ms ± 3.71 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
ridge = Ridge_gpu(alpha = alpha, fit_intercept = True, normalize = False)
cuml_time_ridge = %timeit -o train_data(ridge)

6.1 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
print(f"""Average time of sklearn's {ridge.__class__.__name__}""", sklearn_time_ridge.average, 's')
print(f"""Average time of cuml's {ridge.__class__.__name__}""", cuml_time_ridge.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_ridge.average/cuml_time_ridge.average)

Average time of sklearn's Ridge 0.021097556481428716 s
Average time of cuml's Ridge 0.00610267809714287 s
Ratio between sklearn and cuml is 3.457098038860364


In [71]:
plot(sklearn_time_ridge, cuml_time_ridge).render_notebook()

In [None]:
0.00610267809714287/0.0033603680528572766 

## Nearest Neighbors Regression


In [79]:
X, y = datasets.make_blobs(n_samples=4000, centers=5,
                  n_features=10)

knn = KNeighborsRegressor(n_neighbors=10)

sklearn_time_knn = %timeit -o train_data(knn)

75.8 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [80]:
knn = KNeighborsRegressor_gpu(n_neighbors=10)

cuml_time_knn = %timeit -o train_data(knn)

1.66 ms ± 147 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [81]:
plot(sklearn_time_knn, cuml_time_knn).render_notebook()

In [120]:
models.append(knn.__class__.__name__)
results_sklearn.append(round(sklearn_time_knn.average,3))
results_cuml.append(round(cuml_time_knn.average,3))

# Comparison table

In [121]:
comparison_table = pd.DataFrame({'sklearn(s)': results_sklearn,
             'cuml(s)': results_cuml},
            index=models)

In [122]:
comparison_table['sklearn/cuml'] = comparison_table.apply(lambda row: row['sklearn(s)']/ row['cuml(s)'], axis=1)

In [123]:
comparison_table

Unnamed: 0,sklearn(s),cuml(s),sklearn/cuml
SVC,50.243,23.689,2.120942
RandomForestClassifier,29.824,0.443,67.322799
KNeighborsClassifier,0.078,0.004,19.5
LinearRegression,0.005,0.006,0.833333
Ridge,0.021,0.006,3.5
KNeighborsRegressor,0.076,0.002,38.0


In [124]:
comparison_table.to_pickle('comparison_table.pkl')