In [1]:
# To make sure to be in the right Python virtual environment
import sys
print(sys.executable)

/home/user/datacraft/rust_for_data_science/.venv/bin/python


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split, StratifiedKFold
import time

## Dataset definition and visualization

In [75]:
n_blobs = 4
X, y = make_blobs(n_samples=100000, centers=n_blobs, cluster_std=0.7)#, random_state=0)
y = y.astype(np.int32)

labels, counts = np.unique(y, return_counts=True)

print(X.shape)
print(X[: 10])
print(y[: 10])

# plt.scatter(X[:, 0], X[:, 1], c=y)
# plt.title("Blobs clusters")
# plt.show()

(100000, 2)
[[ 4.6856834  -2.30292619]
 [-0.47142936  0.01453754]
 [ 3.27857164 -5.73276009]
 [ 0.09436453  0.05786487]
 [ 2.40886816 -0.35340859]
 [ 1.82961202 -1.22465973]
 [ 4.77389859 -1.14232579]
 [-7.82092982  3.24291482]
 [ 5.35471661 -0.12146116]
 [ 4.34379688 -5.02951315]]
[0 1 0 1 1 1 2 3 2 0]


## Binary multi-class stratification

### Python implementation

In [4]:
def stratif_train_test_split(X, y, test_size=0.2, train_size=None):

    # Parameters
    n_samples, _ = X.shape
    labels = np.unique(y)
    n_labels = len(labels)
    train_input = True
    if train_size == None:
        train_input = False

    X_train_set = []
    X_test_set = []
    y_train_set = []
    y_test_set = []

    # Constructs subsets
    sub_label_sets = dict(zip(labels, [[] for _ in range(n_labels)]))
    for i in range(n_samples):
        sub_label_sets[y[i]].append(X[i])

    # Constructs train and test sets
    sub_test_sets = dict(zip(labels, [[] for _ in range(len(labels))]))
    sub_train_sets = dict(zip(labels, [[] for _ in range(len(labels))]))

    for l in range(n_labels):
        sub_label_set = sub_label_sets[l]
        n_samples_sub_label_set = len(sub_label_set)
        test_prop = 0
        train_prop = 0
        for i in range(n_samples_sub_label_set):
            if test_prop <= test_size:
                sub_test_sets[l].append(sub_label_set[i])
                test_prop += 1 / n_samples_sub_label_set
                y_test_set.append(l)
            else:
                if train_input:
                    if train_prop > train_size:
                        break
                sub_train_sets[l].append(sub_label_set[i])
                train_prop += 1/ n_samples_sub_label_set
                y_train_set.append(l)

    for l in range(n_labels):
        X_train_set += sub_train_sets[l]
        X_test_set += sub_test_sets[l]

    X_train_set = np.array(X_train_set)
    X_test_set = np.array(X_test_set)
    y_train_set = np.array(y_train_set)
    y_test_set = np.array(y_test_set)

    return X_train_set, X_test_set, y_train_set, y_test_set

### Tests

#### Python with sklearn

In [15]:
# Python with sklearn
start = time.perf_counter()
X_train_py, X_test_py, y_train_py, y_test_py = train_test_split(X, y, test_size=0.33, stratify=y)
print("Python sklearn time :", time.perf_counter() - start)
print("Python sklearn result: train_set length is", len(X_train_py), ", test_set length is", len(X_test_py))


Python sklearn time : 0.05598309799825074
Python sklearn result: train_set length is 67000 , test_set length is 33000


#### Python with manual implementation

In [25]:
# Python with manual implementation
start = time.perf_counter()
X_train_py_m, X_test_py_m, y_train_py_m, y_test_py_m = stratif_train_test_split(X, y, test_size=0.33)
print("Python scratch time :", time.perf_counter() - start)
print("Python scratch result: train_set length is", len(X_train_py_m), ", test_set length is", len(X_test_py_m))

Python scratch time : 0.25572829299926525
Python scratch result: train_set length is 67000 , test_set length is 33000


#### Rust with manual implementation

In [27]:
import rust_pyo3

In [33]:
# Rust with manual implmentation
start = time.perf_counter()
X_train_rust, X_test_rust, y_train_rust, y_test_rust = rust_pyo3.stratified_train_test_split(X, y, test_size=0.33)
print("Rust time :", time.perf_counter() - start)
print("Rust result: train_set length is", len(X_train_py_m), ", test_set length is", len(X_test_py_m))

Rust time : 0.03669349900155794
Rust result: train_set length is 67000 , test_set length is 33000


In [34]:
labels, counts = np.unique(y_test_py, return_counts=True)
print(counts/len(y_test_py))
labels, counts = np.unique(y_train_py, return_counts=True)
print(counts/len(y_train_py))
print("  ")
labels, counts = np.unique(y_test_py_m, return_counts=True)
print(counts/len(y_test_py_m))
labels, counts = np.unique(y_train_py_m, return_counts=True)
print(counts/len(y_train_py_m))
print("  ")
labels, counts = np.unique(y_test_rust, return_counts=True)
print(counts/len(y_test_rust))
labels, counts = np.unique(y_train_rust, return_counts=True)
print(counts/len(y_train_rust))

[0.25 0.25 0.25 0.25]
[0.25 0.25 0.25 0.25]
  
[0.25 0.25 0.25 0.25]
[0.25 0.25 0.25 0.25]
  
[0.25 0.25 0.25 0.25]
[0.25 0.25 0.25 0.25]


### Mean time

In [None]:
N = 1000

In [None]:
py_sk_time = []
py_time = []
rust_time = []

for _ in range(N):
    start = time.time()
    X_train_py, X_test_py, y_trai_py, y_test_py = train_test_split(X, y, test_size=0.33)
    py_sk_time.append(time.time() - start)

    start = time.time()
    X_train_py_m, X_test_py_m, y_train_py_m, y_test_py_m = stratif_train_test_split(X, y, test_size=0.33)
    py_time.append(time.time() - start)

    start = time.time()
    X_train_rust, X_test_rust, y_train_rust, y_test_rust = rust_pyo3.stratified_train_test_split(X, y, test_size=0.33)
    rust_time.append(time.time() - start)

E_py_sk = np.mean(py_sk_time)
E_py = np.mean(py_time)
E_rust = np.mean(rust_time)

print("Mean time execution Python with sklearn :", E_py_sk)
print("Mean time execution Python with manual implementation :", E_py)
print("Mean time execution Rust with manual implementation :", E_rust)

## K-fold multi-class stratification

### Python Implementation

In [56]:
def stratif_kfold_split(X, y, K, sizes=None):

    # Parameters
    n_samples, _ = X.shape
    labels = np.unique(y)
    n_labels = len(labels)

    X_kfolds = [[] for _ in range(K)]
    y_kfolds = [[] for _ in range(K)]

    # Constructs subsets
    sub_label_sets = dict(zip(labels, [[] for _ in range(n_labels)]))
    for i in range(n_samples):
        sub_label_sets[y[i]].append(X[i])

    # Constructs folds
    current_fold = 0
    for l in range(n_labels):
        sub_label_set = sub_label_sets[l]
        n_samples_sub_label_set = len(sub_label_set)
        for i in range(n_samples_sub_label_set):
            X_kfolds[current_fold].append(sub_label_set[i])
            y_kfolds[current_fold].append(l)
            current_fold += 1
            current_fold %= K

    return X_kfolds, y_kfolds

In [None]:
start = time.perf_counter()
X_kfolds_py, y_kfolds_py = stratif_kfold_split(X, y, 10)
print("Python scratch time:", time.perf_counter() - start)

for fold in y_kfolds_py:
    _, counts = np.unique(fold, return_counts=True)
    print("Samples repartition in each folds",counts/len(fold))

Python scratch time: 0.2647780410006817
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]


In [94]:
start = time.perf_counter()
X_kfolds_rust, y_kfolds_rust = rust_pyo3.stratified_kfold_split(X, y, 40)
print("Rust time :", time.perf_counter() - start)
for fold in y_kfolds_rust:
    _, counts = np.unique(fold, return_counts=True)
    print("Samples repartition in each folds",counts/len(fold))

Rust time : 0.05996808700001566
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartition in each folds [0.25 0.25 0.25 0.25]
Samples repartit

In [89]:
N = 100
# py_sk_time_kf = []
py_time_kf = []
rust_time_kf = []

for _ in range(N):
    start = time.perf_counter()
    X_kfolds_py, y_kfolds_py = stratif_kfold_split(X, y, 3)
    py_time_kf.append(time.perf_counter() - start)

    start = time.perf_counter()
    X_kfolds_rust, y_kfolds_rust = rust_pyo3.stratified_kfold_split(X, y, 3)
    rust_time_kf.append(time.perf_counter() - start)

# E_py_sk = np.mean(py_sk_time_kf)
E_py_kf = np.mean(py_time_kf)
E_rust_kf = np.mean(rust_time_kf)

# print("Mean time execution Python with sklearn :", E_py_sk)
print("Mean time execution Python with manual implementation :", E_py_kf)
print("Mean time execution Rust with manual implementation :", E_rust_kf)

Mean time execution Python with manual implementation : 0.12632954521013745
Mean time execution Rust with manual implementation : 0.04483465774996148
