In [1]:
from sklearn.datasets import load_breast_cancer
import numpy as np
from collections import Counter

In [2]:
x = load_breast_cancer()['data']
y = load_breast_cancer()['target']

In [3]:
counts = Counter(y) # label counts of y
counts

Counter({0: 212, 1: 357})

In [4]:
y_count = len(y) # count of all labels in y
y_count

569

In [5]:
counts_percent = {key: value / y_count for key, value in counts.items()} # proportions of labels in y
counts_percent

{0: 0.37258347978910367, 1: 0.6274165202108963}

In [6]:
test_prop = 0.3

In [7]:
y_test_count = {label: int(np.ceil(test_prop * prop * y_count)) for label, prop in counts_percent.items()} # count of samples per label in test set
y_test_count

{0: 64, 1: 108}

In [8]:
len_y_test = y_count * test_prop
len_y_test

170.7

In [9]:
if sum([*y_test_count.values()]) > len_y_test:   
    first_key = [*y_test_count.keys()][0] # getting first label
    y_test_count[first_key] -= (sum([*y_test_count.values()]) - len_y_test)  # adding the leftover samples to first label 

In [10]:
y_test_count

{0: 62.69999999999999, 1: 108}

In [11]:
train_samples = np.array([], dtype="int") # creating empty array to hold train sample idx
test_samples = np.array([], dtype="int") # creating empty array to hold test sample idx
for i in [*y_test_count.keys()]: # iterating through labels
    np.random.seed(42)
    samples = np.where(y==i)[0] # selecting idxs from y which are equal to a particular label
    # appending randomly chosen test samples of a particular label to test_samples array
    test_samples = np.append(test_samples, np.random.choice(a=samples, size=int(y_test_count[i]), replace=False))
    # appending samples that are not present in test_samples to train_samples
    train_samples = np.append(train_samples, np.setdiff1d(samples, test_samples)) # return idx in samples that are not test_samples
    x_train, x_test, y_train, y_test = x[train_samples], x[test_samples], y[train_samples], y[test_samples]

In [12]:
train_samples

array([  0,   1,   2,   3,   4,   6,   7,   8,  10,  11,  12,  13,  14,
        17,  23,  24,  25,  26,  29,  30,  31,  34,  35,  36,  38,  39,
        40,  41,  43,  44,  45,  47,  53,  54,  57,  62,  64,  65,  70,
        72,  73,  75,  77,  83,  85,  86,  91,  94,  95,  99, 100, 119,
       121, 122, 127, 131, 132, 134, 138, 141, 156, 162, 167, 168, 171,
       172, 177, 180, 182, 193, 194, 198, 199, 202, 203, 205, 212, 213,
       214, 215, 218, 223, 229, 233, 236, 237, 239, 244, 250, 252, 255,
       256, 257, 258, 259, 260, 261, 263, 265, 272, 277, 280, 283, 297,
       300, 321, 328, 329, 330, 335, 337, 343, 351, 352, 353, 365, 366,
       369, 370, 372, 379, 385, 392, 393, 408, 414, 417, 432, 433, 435,
       444, 460, 461, 468, 489, 492, 498, 499, 501, 509, 512, 514, 516,
       521, 533, 535, 536, 562, 564, 567,  19,  20,  21,  46,  49,  51,
        55,  58,  59,  60,  61,  68,  69,  71,  74,  79,  84,  88,  89,
        90,  93,  96,  98, 101, 102, 103, 104, 107, 109, 111, 11

In [13]:
test_samples

array([ 33, 400, 274, 129,  87, 565,  56, 446,   9, 196, 264, 479, 339,
       146,  15, 517, 373,  18, 566, 164, 135,  78, 317, 563, 323, 282,
        28, 201,  16, 430, 262, 253, 210, 117, 126, 302, 441, 186, 487,
       181, 219, 368, 503, 254, 207, 118, 161, 190, 197,  42,  27, 449,
       230,  32,  22, 451, 184, 108, 105,   5, 389,  82, 395, 110, 481,
       493, 136, 541,  63, 137, 422, 248, 173, 544, 376, 462, 242, 251,
       484, 482, 284, 559, 285, 391, 416,  97, 338, 388, 292, 525,  81,
       157, 450,  52, 550, 293, 232, 106,  76, 561, 192, 187, 537,  92,
       496, 225, 305, 403, 133,  48, 113, 397, 465, 409, 476, 165, 552,
       342,  37, 238, 160, 425, 145, 114, 377, 163, 155, 175, 551, 358,
       434, 191, 470, 356, 471, 539, 477, 159, 436, 557,  66, 149, 271,
        50, 490, 298, 518, 510, 390, 306, 226, 453, 142, 234, 437, 227,
       472, 401,  67, 290,  80, 211, 540, 340, 508, 279, 303, 396, 558,
       364], dtype=int64)

In [14]:
# ensuring no duplicate train samples
assert len(set(train_samples)) == len(train_samples)

In [15]:
# ensuring no duplicate test samples
assert len(set(test_samples)) == len(test_samples)

In [16]:
# ensuring no intersection between train and test samples
assert len(set(test_samples).intersection(set(train_samples))) == 0

In [17]:
len(test_samples)

170

In [18]:
len(train_samples)

399

In [19]:
len(y)

569

In [20]:
assert len(y) == (len(test_samples) + len(train_samples))

In [21]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((399, 30), (399,), (170, 30), (170,))

In [22]:
x = load_breast_cancer()['data']
y = load_breast_cancer()['target']

In [23]:
def stratified_train_test_split(x, y, test_prop=0.3, random_state=42):
    x = x.copy()
    y = y.copy()
    counts = Counter(y) # label counts of y
    y_count = len(y) # count of all labels in y
    counts_percent = {key: value / y_count for key, value in counts.items()} # proportions of labels in y
    y_test_count = {label: int(np.ceil(test_prop * prop * y_count)) for label, prop in counts_percent.items()} # count of samples per label in test set
    
    # Since y_test_count rounds up the sample counts, the count of samples per label might not match the sum of total sample count
    # Hence, adjusting sample counts to match total count.  
    if sum([*y_test_count.values()]) > len_y_test:   
        first_key = [*y_test_count.keys()][0] # getting first label
        y_test_count[first_key] -= (sum([*y_test_count.values()]) - len_y_test)  # adding the leftover samples to first label 
        
    train_samples = np.array([], dtype="int") # creating empty array to hold train sample idx
    test_samples = np.array([], dtype="int") # creating empty array to hold test sample idx
    for i in [*y_test_count.keys()]: # iterating through labels
        np.random.seed(random_state)
        samples = np.where(y==i)[0] # selecting idxs from y which are equal to a particular label
        # appending randomly chosen test samples of a particular label to test_samples array
        test_samples = np.append(test_samples, np.random.choice(a=samples, size=int(y_test_count[i]), replace=False))
        # appending samples that are not present in test_samples to train_samples
        train_samples = np.append(train_samples, np.setdiff1d(samples, test_samples)) # return idx in samples that are not test_samples
        x_train, x_test, y_train, y_test = x[train_samples], x[test_samples], y[train_samples], y[test_samples]
    return x_train, x_test, y_train, y_test

In [24]:
x_train, x_test, y_train, y_test = stratified_train_test_split(x, y)

In [25]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((399, 30), (399,), (170, 30), (170,))

In [26]:
import pandas as pd
# y distribution
pd.Series(y).value_counts(normalize=True)

1    0.627417
0    0.372583
dtype: float64

In [27]:
# y_train distribution
pd.Series(y_train).value_counts(normalize=True)

1    0.62406
0    0.37594
dtype: float64

In [28]:
# y_test distribution
pd.Series(y_test).value_counts(normalize=True)

1    0.635294
0    0.364706
dtype: float64

In [29]:
assert len(y) == (len(y_train) + len(y_test))

In [30]:
assert len(x) == (len(x_train) + len(x_test))