In [60]:
# To make sure to be in the right Python virtual environment
import sys
print(sys.executable)

/home/user/datacraft/rust_for_data_science/.venv/bin/python


In [61]:
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
import time

## Dataset definition and visualization

In [63]:
n_blobs = 5
X, y = make_blobs(n_samples=1234567, centers=n_blobs, cluster_std=0.7)#, random_state=0)
y = y.astype(np.int32)

labels, counts = np.unique(y, return_counts=True)

print(X.shape)
print(X[: 10])
print(y[: 10])

# plt.scatter(X[:, 0], X[:, 1], c=y)
# plt.title("Blobs clusters")
# plt.show()

(1234567, 2)
[[ 2.83504537  8.91090269]
 [ 2.34610109 -2.18009982]
 [-3.70175837  7.86309118]
 [ 1.2806828  -3.81573969]
 [-9.08803515  5.4218733 ]
 [-8.57875884  4.53413153]
 [-3.24260687  5.53766794]
 [ 1.64619415 -1.82351675]
 [ 1.17192308 -4.08816728]
 [-4.5593352   4.6380686 ]]
[1 4 0 4 3 3 0 4 4 2]


## Binary multi-class stratification

### Python implementation

In [88]:
def stratif_train_test_split(X, y, test_size=0.2, train_size=None):

    # Parameters
    n_samples, _ = X.shape
    labels = np.unique(y)
    n_labels = len(labels)
    train_input = True
    if train_size == None:
        train_input = False

    X_train_set = []
    X_test_set = []
    y_train_set = []
    y_test_set = []

    # Constructs subsets
    sub_label_sets = dict(zip(labels, [[] for _ in range(n_labels)]))
    for i in range(n_samples):
        sub_label_sets[y[i]].append(X[i])

    # Constructs train and test sets
    sub_test_sets = dict(zip(labels, [[] for _ in range(len(labels))]))
    sub_train_sets = dict(zip(labels, [[] for _ in range(len(labels))]))

    for l in range(n_labels):
        sub_label_set = sub_label_sets[l]
        n_samples_sub_label_set = len(sub_label_set)
        test_prop = 0
        train_prop = 0
        for i in range(len(sub_label_set)):
            if test_prop <= test_size:
                sub_test_sets[l].append(sub_label_set[i])
                test_prop += 1 / n_samples_sub_label_set
                y_test_set.append(l)
            else:
                if train_input:
                    if train_prop > train_size:
                        break
                sub_train_sets[l].append(sub_label_set[i])
                train_prop += 1/ n_samples_sub_label_set
                y_train_set.append(l)

    for l in range(n_labels):
        X_train_set += sub_train_sets[l]
        X_test_set += sub_test_sets[l]

    X_train_set = np.array(X_train_set)
    X_test_set = np.array(X_test_set)
    y_train_set = np.array(y_train_set)
    y_test_set = np.array(y_test_set)

    return X_train_set, X_test_set, y_train_set, y_test_set

### Tests

#### Python with sklearn

In [83]:
# Python with sklearn
start = time.time()
X_train_py, X_test_py, y_train_py, y_test_py = train_test_split(X, y, test_size=0.33, stratify=y)
print("Python sklearn time :", time.time() - start)
print("Python sklearn result: train_set length is", len(X_train_py), ", test_set length is", len(X_test_py))


Python sklearn time : 0.8665854930877686
Python sklearn result: train_set length is 827159 , test_set length is 407408


#### Python with manual implementation

In [87]:
# Python with manual implementation
start = time.time()
X_train_py_m, X_test_py_m, y_train_py_m, y_test_py_m = stratif_train_test_split(X, y, test_size=0.33)
print("Python scratch time :", time.time() - start)
print("Python scratch result: train_set length is", len(X_train_py_m), ", test_set length is", len(X_test_py_m))

Python scratch time : 5.258981943130493
Python scratch result: train_set length is 827157 , test_set length is 407410


#### Rust with manual implementation

In [67]:
import rust_pyo3

In [85]:
# Rust with manual implmentation
start = time.time()
X_train_rust, X_test_rust, y_train_rust, y_test_rust = rust_pyo3.stratified_train_test_split(X, y, test_size=0.33)
print("Rust time :", time.time() - start)
print("Rust result: train_set length is", len(X_train_py_m), ", test_set length is", len(X_test_py_m))

Rust time : 2.497755765914917
Rust result: train_set length is 827157 , test_set length is 407410


### Mean time

In [50]:
N = 1000

In [52]:
py_sk_time = []
py_time = []
rust_time = []

for _ in range(N):
    start = time.time()
    X_train_py, X_test_py, y_trai_py, y_test_py = train_test_split(X, y, test_size=0.33)
    py_sk_time.append(time.time() - start)

    start = time.time()
    X_train_py_m, X_test_py_m, y_train_py_m, y_test_py_m = stratif_train_test_split(X, y, test_size=0.33)
    py_time.append(time.time() - start)

    start = time.time()
    X_train_rust, X_test_rust, y_train_rust, y_test_rust = rust_pyo3.stratified_train_test_split(X, y, test_size=0.33)
    rust_time.append(time.time() - start)

E_py_sk = np.mean(py_sk_time)
E_py = np.mean(py_time)
E_rust = np.mean(rust_time)

print("Mean time execution Python with sklearn :", E_py_sk)
print("Mean time execution Python with manual implementation :", E_py)
print("Mean time execution Rust with manual implementation :", E_rust)

Mean time execution Python with sklearn : 0.006308325529098511
Mean time execution Python with manual implementation : 0.2346442768573761
Mean time execution Rust with manual implementation : 0.17573338747024536


## K-fold multi-class stratification

### Python Implementation

In [None]:
def stratif_kfold_split(X, y, K, test_size=0.2, train_size=None):

    # Parameters
    n_samples, _ = X.shape
    labels = np.unique(y)
    n_labels = len(labels)
    train_input = True
    if train_size == None:
        train_input = False

    X_train_set = []
    X_test_set = []
    y_train_set = []
    y_test_set = []

    # Constructs subsets
    sub_label_sets = dict(zip(labels, [[] for _ in range(n_labels)]))
    for i in range(n_samples):
        sub_label_sets[y[i]].append(X[i])

    # Constructs train and test sets
    sub_test_sets = dict(zip(labels, [[] for _ in range(len(labels))]))
    sub_train_sets = dict(zip(labels, [[] for _ in range(len(labels))]))

    for l in range(n_labels):
        sub_label_set = sub_label_sets[l]
        n_samples_sub_label_set = len(sub_label_set)
        test_prop = 0
        train_prop = 0
        for i in range(len(sub_label_set)):
            if test_prop <= test_size:
                sub_test_sets[l].append(sub_label_set[i])
                test_prop += 1 / n_samples_sub_label_set
                y_test_set.append(l)
            else:
                if train_input:
                    if train_prop > train_size:
                        break
                sub_train_sets[l].append(sub_label_set[i])
                train_prop += 1/ n_samples_sub_label_set
                y_train_set.append(l)

    for l in range(n_labels):
        X_train_set += sub_train_sets[l]
        X_test_set += sub_test_sets[l]

    return X_train_set, X_test_set, y_train_set, y_test_set
