## Exploring the speed-up in Scikit-Learn ML algorithms from Intel® Extension (without extension activated)

Begin in the usual way by importing a load of packages and methods:

In [1]:
import pandas as pd, numpy as np, time, warnings

#from sklearnex import patch_sklearn
#patch_sklearn()

import sklearn
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Suppress all Python warnings so they do not clutter the cell outputs below.
warnings.filterwarnings('ignore')

# Last expression of the cell: displays the installed scikit-learn version.
sklearn.__version__

'1.0.1'

Create a synthetic dataset (1 million rows x 50 columns) with binary class labels to use for these tests:

In [2]:
# Dataset dimensions and seed, named once so that the generator call and the
# column labels cannot drift apart.
N_SAMPLES = 1_000_000
N_FEATURES = 50
SEED = 1984  # same seed value that the train/test split cells use

# Seeded so that every Restart & Run All benchmarks exactly the same synthetic data.
features, labels = make_classification(
    n_samples=N_SAMPLES, n_features=N_FEATURES, random_state=SEED
)

# One column name per generated feature: feature0 ... feature49
cols_ml = [f'feature{i}' for i in range(N_FEATURES)]

# for convenience let's put it all together in a pandas dataframe:
dataset = pd.DataFrame(data=features, columns=cols_ml)
dataset['Target'] = labels

# inspect the resulting dataframe
dataset

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature41,feature42,feature43,feature44,feature45,feature46,feature47,feature48,feature49,Target
0,-0.422800,-0.584662,0.462237,0.989699,-1.121211,-0.635441,-0.548132,-0.530651,2.172797,0.603010,...,2.159485,0.901096,0.018292,-0.265369,-1.119584,-0.070546,-1.168910,0.262057,2.160698,0
1,-1.041315,-1.206425,-0.309431,-1.920550,-0.906647,0.392948,-0.447752,0.539187,0.657150,1.960977,...,1.013742,0.237744,-0.231629,-0.665971,-0.252495,-1.359173,-1.141313,0.290716,-1.033460,0
2,-1.541894,-1.919632,-0.461707,1.108704,-1.494200,2.202662,-0.115268,-0.338679,-1.450074,-1.471831,...,-3.796161,-0.753390,-0.501331,-2.269483,0.027072,0.404530,-1.253471,-0.707580,1.068665,1
3,0.426785,0.312948,0.216396,-2.001491,-0.597713,-0.285472,-0.119522,-1.817114,-0.629797,0.584540,...,1.751154,-0.686781,0.532474,-1.448586,2.279289,-0.628497,0.632469,-1.242756,0.573785,0
4,0.639737,0.219193,-1.287882,1.297504,1.459814,0.942885,-0.221012,1.396323,0.412051,0.473779,...,-1.351119,-1.195827,0.019775,-2.226484,-0.198740,-0.085489,-0.479642,-0.533513,-1.782120,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,-0.211418,1.032209,1.541046,1.829190,1.429372,-0.265037,-1.317229,0.381129,0.582187,1.970145,...,-1.192479,0.430262,0.260991,1.347965,-1.540521,-1.290673,0.099129,0.577689,-1.616334,1
999996,0.863530,0.301672,0.108365,-0.405745,0.401090,1.047433,-0.887619,0.456457,1.008957,-1.073167,...,1.025543,-1.595556,-0.920994,-0.462491,-1.073571,-1.703985,0.660477,0.563899,1.615149,1
999997,-0.046031,-0.643225,-1.621285,1.478438,-0.339183,0.025995,1.083149,-0.233827,0.707580,-0.835927,...,-1.527178,-1.871331,-0.422885,-1.404959,-0.535566,-1.344177,0.103058,1.189367,0.966756,1
999998,-0.311873,0.635279,-1.035370,-1.170977,-0.189695,-0.468615,-0.697571,0.835783,-0.939901,-0.950801,...,0.203080,1.306455,-1.275350,1.070224,-0.956421,0.750739,-0.952765,0.513875,0.854141,0


In [3]:
# Scaler instance shared by the timed scaling cell and its untimed repeat that follow.
scaler = StandardScaler()

In [4]:
%%timeit
# Time fitting the scaler and transforming every feature column, writing the result back into the dataframe.
dataset[cols_ml] = scaler.fit_transform(dataset[cols_ml])

1.91 s ± 34.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# when using timeit I do each operation a second time to get the output
# (this is the untimed repeat of the scaling step timed in the previous cell)
dataset[cols_ml] = scaler.fit_transform(dataset[cols_ml])

Calculate and apply a PCA transform:

In [6]:
#%%timeit
#dimReducer = PCA(n_components=10)
#result = dimReducer.fit_transform(dataset[cols_ml].to_numpy())
# the result isn't used for anything in this notebook, but the test is included for interest
# commented out due to heavy memory requirements

Do a train / test split:

In [7]:
%%timeit
# Timing only: a seeded 70/30 split of the full dataframe, passed in as NumPy arrays.
# The names assigned here are not relied on afterwards; the split that the classifiers
# actually train on is made in the next code cell, after downsampling.
X_train, X_test, y_train, y_test = train_test_split(dataset[cols_ml].to_numpy(),dataset['Target'].to_numpy(),test_size=0.3,random_state=1984)

276 ms ± 2.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# to avoid waiting too long for the classifiers to train, let's downsample the data from 1m to 20k rows
# The sample is seeded (same value as the split) so that the same 20k rows are drawn on
# every run; otherwise the seeded split below would still act on different data each time.
# NOTE: this rebinds `dataset` to the 20k-row sample; re-run the notebook from the top
# to get the full dataframe back.
dataset = dataset.sample(n=20_000, random_state=1984)
X_train, X_test, y_train, y_test = train_test_split(
    dataset[cols_ml], dataset['Target'], test_size=0.3, random_state=1984
)

Fit a random forest classifier and predict classes:

In [9]:
# 1000 trees built with n_jobs=4 parallel jobs. Seeded so that the forest being timed
# (and the predictions it produces) is the same on every run of the notebook.
clf = RandomForestClassifier(n_estimators=1000, n_jobs=4, random_state=1984)

In [10]:
%%timeit
# Time training the random forest on the training split of the 20k-row sample.
clf.fit(X_train,y_train)

31.9 s ± 417 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Untimed repeat of the fit timed in the previous cell; the cell output is the fitted estimator's repr.
clf.fit(X_train,y_train)

RandomForestClassifier(n_estimators=1000, n_jobs=4)

In [12]:
%%timeit
# Time probability prediction on the test split. [:,1] keeps the second column of the
# predict_proba output (presumably the probability of class 1 for these binary labels).
preds = clf.predict_proba(X_test)[:,1]

321 ms ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Untimed repeat of the prediction timed in the previous cell, so that preds is assigned by a normal cell run.
preds = clf.predict_proba(X_test)[:,1]

Now let's try the KNeighborsClassifier (without the intel extension):

In [14]:
# Rebinds clf: from here on it refers to the k-nearest-neighbours model, not the
# random forest fitted above. n_jobs=4 matches the setting used for the random forest.
clf = KNeighborsClassifier(n_jobs=4)

In [15]:
%%timeit
# Time fitting the k-nearest-neighbours classifier on the training split.
clf.fit(X_train,y_train)

1.8 ms ± 27.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# Untimed repeat of the fit timed in the previous cell; the cell output is the fitted estimator's repr.
clf.fit(X_train,y_train)

KNeighborsClassifier(n_jobs=4)

In [17]:
%%timeit
# Time probability prediction with the k-nearest-neighbours model on the test split;
# [:,1] keeps the second column of the predict_proba output.
preds = clf.predict_proba(X_test)[:,1]

7.36 s ± 814 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
# Untimed repeat of the prediction timed in the previous cell; overwrites the random forest's preds.
preds = clf.predict_proba(X_test)[:,1]