In [3]:
# Test-Train Data Splits 
# https://realpython.com/train-test-split-python-data/

import numpy as np
from sklearn.model_selection import train_test_split

In [12]:
# Toy dataset: 12 samples with 2 features each, filled row by row with the integers 1..24.
# reshape(-1, 2) lets NumPy infer the row count (12) from the 24 elements.
x = np.arange(1, 25).reshape(-1, 2)

# One binary class label per sample (six 0s and six 1s).
y = np.array(
    [0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
)

# Original feature matrix:
x

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10],
       [11, 12],
       [13, 14],
       [15, 16],
       [17, 18],
       [19, 20],
       [21, 22],
       [23, 24]])

In [5]:
# Original label vector: one 0/1 class label for each of the 12 rows of x.
y

array([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

In [6]:
# Version 1
# Different each time: no random_state is passed, so the shuffle (and therefore the split)
# changes on every run, and the outputs recorded below will not be reproduced on a re-run.
# With test_size and train_size both left unset, scikit-learn holds out 25% of the samples
# for testing, which here means 3 of the 12 rows.
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Here is how they were broken down (training features first):
x_train

array([[11, 12],
       [ 5,  6],
       [23, 24],
       [21, 22],
       [ 7,  8],
       [ 9, 10],
       [ 1,  2],
       [ 3,  4],
       [13, 14]])

In [7]:
# Version 1 test features: the rows of x that were not placed in x_train.
x_test

array([[15, 16],
       [17, 18],
       [19, 20]])

In [8]:
# Version 1 training labels, in the same (shuffled) order as the rows of x_train.
y_train

array([0, 1, 0, 1, 0, 1, 0, 1, 0])

In [9]:
# Version 1 test labels, in the same order as the rows of x_test.
y_test

array([1, 1, 0])

In [13]:
# Version 2
# Reproducible split: a fixed random_state seeds the shuffle, so re-running this cell
# produces the same partition instead of a new one each time.
# An integer test_size is an absolute number of test samples (4 rows here, leaving 8 to train on).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=4,
    test_size=4,
)

# Training features produced by this split:
x_train

array([[17, 18],
       [ 5,  6],
       [23, 24],
       [ 1,  2],
       [ 3,  4],
       [11, 12],
       [15, 16],
       [21, 22]])

In [14]:
# Version 2 test features: the 4 rows held out because test_size=4.
x_test

array([[ 7,  8],
       [ 9, 10],
       [13, 14],
       [19, 20]])

In [15]:
# Version 2 training labels, aligned with x_train. Without stratify the class balance is not
# preserved: the recorded output has five 1s and three 0s.
y_train

array([1, 1, 0, 0, 1, 0, 1, 1])

In [16]:
# Version 2 test labels, aligned with x_test (three 0s and one 1 in the recorded output).
y_test

array([0, 1, 0, 0])

In [17]:
# Version 3
# Stratified splitting: stratify=y keeps the proportion of each class in y (approximately)
# the same in both the training and the test set.
# A float test_size is a fraction of the data; 0.33 of 12 samples yields 4 test rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=4,
    test_size=0.33,
)

# Training features produced by this split:
x_train

array([[21, 22],
       [ 1,  2],
       [15, 16],
       [13, 14],
       [17, 18],
       [19, 20],
       [23, 24],
       [ 3,  4]])

In [18]:
# Version 3 test features: the 4 rows held out by the stratified split.
x_test

array([[11, 12],
       [ 7,  8],
       [ 5,  6],
       [ 9, 10]])

In [19]:
# Version 3 training labels: four 0s and four 1s, matching the 50/50 class balance of y.
y_train

array([1, 0, 1, 0, 1, 0, 0, 1])

In [20]:
# Version 3 test labels: two 0s and two 1s, so the test set keeps the same class balance as y.
y_test

array([0, 0, 1, 1])

In [21]:
# Version 4
# Ordered split: shuffle=False turns off shuffling, so nothing is randomised. The leading
# rows of x/y become the training set and the trailing rows the test set, in original order.
# No random_state is needed because the result is already deterministic.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    shuffle=False,
    test_size=0.33,
)

# Training features produced by this split:
x_train

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10],
       [11, 12],
       [13, 14],
       [15, 16]])

In [22]:
# Version 4 test features: the last 4 rows of x, in their original order.
x_test

array([[17, 18],
       [19, 20],
       [21, 22],
       [23, 24]])

In [23]:
# Version 4 training labels: the first 8 entries of y, order unchanged.
y_train

array([0, 1, 1, 0, 1, 0, 0, 1])

In [24]:
# Version 4 test labels: the last 4 entries of y, order unchanged.
y_test

array([1, 0, 1, 0])