In [3]:
from dask.distributed import Client

# Start a local Dask cluster with 4 workers, one thread each, and connect to it.
# If the default dashboard port (8787) is taken, Dask serves the dashboard on a
# random port instead; the actual address is shown in the client summary.
client = Client(n_workers=4, threads_per_worker=1)
# Bare last expression so the notebook renders the client/cluster summary.
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:65271  Dashboard: http://127.0.0.1:65274/status,Cluster  Workers: 4  Cores: 4  Memory: 33.88 GB


In [4]:
from sklearn.datasets import make_classification
from sklearn.svm import SVC #support vector machines
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [5]:
# Build a synthetic classification problem with 1000 samples. The seed is fixed
# so the same data is generated on every run.
X, y = make_classification(n_samples=1000, random_state=0)
# Preview the first five feature rows.
X[:5]

array([[-1.06377997,  0.67640868,  1.06935647, -0.21758002,  0.46021477,
        -0.39916689, -0.07918751,  1.20938491, -0.78531472, -0.17218611,
        -1.08535744, -0.99311895,  0.30693511,  0.06405769, -1.0542328 ,
        -0.52749607, -0.0741832 , -0.35562842,  1.05721416, -0.90259159],
       [ 0.0708476 , -1.69528125,  2.44944917, -0.5304942 , -0.93296221,
         2.86520354,  2.43572851, -1.61850016,  1.30071691,  0.34840246,
         0.54493439,  0.22532411,  0.60556322, -0.19210097, -0.06802699,
         0.9716812 , -1.79204799,  0.01708348, -0.37566904, -0.62323644],
       [ 0.94028404, -0.49214582,  0.67795602, -0.22775445,  1.40175261,
         1.23165333, -0.77746425,  0.01561602,  1.33171299,  1.08477266,
        -0.97805157, -0.05012039,  0.94838552, -0.17342825, -0.47767184,
         0.76089649,  1.00115812, -0.06946407,  1.35904607, -1.18958963],
       [-0.29951677,  0.75988955,  0.18280267, -1.55023271,  0.33821802,
         0.36324148, -2.10052547, -0.4380675 , -

In [8]:
# Hyper-parameter grid: 8 values of C x 3 kernels x 2 shrinking settings,
# i.e. 48 candidate configurations.
param_grid = {
    "C": [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    "kernel": ['rbf', 'poly', 'sigmoid'],
    "shrinking": [True, False],
}

# Exhaustive search over the grid with 3-fold cross-validation.
# n_jobs=-1 asks joblib to use every available local core.
# NOTE(review): `iid` is deprecated in newer scikit-learn releases - confirm
# against the installed version before upgrading.
grid_search = GridSearchCV(
    estimator=SVC(gamma='auto', random_state=0, probability=True),
    param_grid=param_grid,
    cv=3,
    iid=True,
    n_jobs=-1,
    return_train_score=False,
)

In [9]:
# Run the grid search locally: every candidate in param_grid is fitted once per
# CV fold (48 candidates x 3 folds), parallelised by joblib via n_jobs=-1.
grid_search.fit(X, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'shrinking': [True, False], 'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [6]:
# Predict with the fitted search object and show the first five predicted labels.
grid_search.predict(X)[:5]

array([0, 1, 1, 1, 0])

In [7]:
# Score the fitted search object. X, y is the same data the search was fitted
# on, so this is a training-set score rather than a held-out estimate.
grid_search.score(X, y)

0.972

In [8]:
# Re-run the same grid search, this time sending the individual fits to the
# Dask cluster. scikit-learn parallelises through joblib (a small library for
# simple parallel programming, developed and used mainly by the scikit-learn
# community), so switching joblib's backend is all that is needed.
try:
    # scikit-learn < 0.23 exposes joblib here; on the oldest releases this
    # vendored copy is the one GridSearchCV itself uses, so prefer it when present.
    from sklearn.externals import joblib
except ImportError:
    # scikit-learn >= 0.23 removed sklearn.externals.joblib; fall back to the
    # standalone joblib package that scikit-learn now depends on.
    import joblib

# Inside this context every joblib-parallel call is scheduled on the Dask
# cluster of the active Client instead of local worker processes.
with joblib.parallel_backend('dask'):
    grid_search.fit(X, y)

In [9]:
# Tabulate the cross-validation results of the search (one row per parameter
# combination) and show the first five rows.
pd.DataFrame(grid_search.cv_results_).head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_C,param_kernel,param_shrinking,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,std_fit_time,std_score_time,std_test_score
0,0.333442,0.04621,0.502,0.001,rbf,True,"{'kernel': 'rbf', 'shrinking': True, 'C': 0.001}",41,0.502994,0.501502,0.501502,0.041573,0.009715,0.000704
1,0.328788,0.027593,0.502,0.001,rbf,False,"{'kernel': 'rbf', 'shrinking': False, 'C': 0.001}",41,0.502994,0.501502,0.501502,0.019818,0.008317,0.000704
2,0.219746,0.011635,0.502,0.001,poly,True,"{'kernel': 'poly', 'shrinking': True, 'C': 0.001}",41,0.502994,0.501502,0.501502,0.020622,0.00047,0.000704
3,0.217086,0.011635,0.502,0.001,poly,False,"{'kernel': 'poly', 'shrinking': False, 'C': 0....",41,0.502994,0.501502,0.501502,0.015067,0.00094,0.000704
4,0.404584,0.038231,0.502,0.001,sigmoid,True,"{'kernel': 'sigmoid', 'shrinking': True, 'C': ...",41,0.502994,0.501502,0.501502,0.059318,0.015285,0.000704


In [11]:
from dask_ml.model_selection import train_test_split

# Hold out 15% of the samples for evaluation. The seed is fixed, like the one
# used for the dataset and the SVC, so the split (and every metric computed
# from it) is reproducible across re-runs.
# NOTE(review): the last recorded run of this cell raised
# "ValueError: high is out of bounds for int32". The cause is not visible from
# this notebook - presumably a platform / dask-ml version issue; fixing the
# seed is not expected to resolve it. TODO confirm.
# NOTE(review): X and y are NumPy arrays here; verify that the downstream
# dask_xgboost cells accept them or convert to Dask collections first.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

ValueError: high is out of bounds for int32

In [None]:
import dask_xgboost
import xgboost

In [None]:
# Booster settings handed to dask_xgboost.train below.
params = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5,
}

# Train on the cluster behind `client` for 10 boosting rounds.
bst = dask_xgboost.train(
    client,
    params,
    X_train,
    y_train,
    num_boost_round=10,
)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Plot the feature importance of the trained booster, limited to at most
# nine features.
ax = xgboost.plot_importance(
    bst,
    max_num_features=9,
    height=0.8,
)
ax.set_title('Estimated feature importance')
ax.grid(False, axis="y")  # switch off the grid lines along the y axis
plt.show()

In [None]:
# Predict on the held-out split with the trained booster, using the cluster
# behind `client`. The result is persisted so later cells can reuse it.
# NOTE(review): presumably a Dask collection is returned - confirm.
y_hat = dask_xgboost.predict(client, bst, X_test).persist()
# Bare last expression so the notebook displays the prediction object.
y_hat

In [None]:
from sklearn.metrics import roc_curve

# False-positive / true-positive rates of the predictions on the held-out split.
# The third return value is bound to a real name instead of `_`: assigning to
# `_` in IPython shadows the automatic "last output" variable, which then stops
# being updated for the rest of the session.
fpr, tpr, thresholds = roc_curve(y_test, y_hat)

In [None]:
from sklearn.metrics import auc

fig, ax = plt.subplots(figsize=(5, 5))

# ROC curve of the model, labelled with its area under the curve.
ax.plot(
    fpr,
    tpr,
    lw=3,
    label='ROC Curve (area = {:.2f})'.format(auc(fpr, tpr)),
)
# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
ax.plot([0, 1], [0, 1], 'k--', lw=2)

# Clamp both axes to [0, 1] and label the figure.
ax.set_xlim((0, 1))
ax.set_ylim((0, 1))
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")

ax.legend()
plt.show()