# Snippet

In [24]:
import uuid
from typing import Union
from zlib import crc32


def split_check(
    unique_id: Union[int, str], test_ratio: float, validation_ratio: float = 0
) -> str:
    """Deterministically assign an identifier to a data split.

    The identifier is stringified, UTF-8 encoded and hashed with CRC32. The
    unsigned 32-bit hash range is then carved up from the top: the highest
    ``test_ratio`` share maps to ``"test"``, the next ``validation_ratio``
    share maps to ``"validation"`` and everything below maps to
    ``"training"``. The same identifier therefore always lands in the same
    split, regardless of what other rows are present.

    Args:
        unique_id: Identifier of the record; converted with ``str()`` before
            hashing.
        test_ratio: Fraction (0..1) of the hash range assigned to ``"test"``.
        validation_ratio: Fraction (0..1) of the hash range assigned to
            ``"validation"``. Defaults to 0 (no validation split).

    Returns:
        One of ``"test"``, ``"validation"`` or ``"training"``.

    Raises:
        ValueError: If a ratio is outside ``[0, 1]`` or the two ratios sum to
            more than 1.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio!r}")
    if not 0 <= validation_ratio <= 1:
        raise ValueError(
            f"validation_ratio must be within [0, 1], got {validation_ratio!r}"
        )
    # Small tolerance so that float pairs such as 0.7 + 0.3 are not rejected
    # because of rounding noise.
    if test_ratio + validation_ratio > 1 + 1e-9:
        raise ValueError("test_ratio + validation_ratio must not exceed 1")

    byteslike_identifier = bytes(str(unique_id), "utf-8")
    # Mask to an unsigned 32-bit value so the result is platform independent.
    hashed = crc32(byteslike_identifier) & 0xFFFFFFFF

    max_hash = 2 ** 32 - 1
    # Boundaries are derived from the requested ratios (previously both were
    # hard-coded to 0.2, silently ignoring the arguments).
    test_boundary = max_hash - max_hash * test_ratio
    validation_boundary = test_boundary - max_hash * validation_ratio

    if hashed >= test_boundary:
        return "test"
    # Reaching this point implies hashed < test_boundary.
    if hashed >= validation_boundary:
        return "validation"
    return "training"

def add_split_column(data, id_column, test_ratio, **kwargs):
    """Return ``data`` with a ``split_assignment`` column added.

    Each value in ``data[id_column]`` is passed to ``split_check`` and the
    resulting label is stored in a new ``split_assignment`` column via
    ``data.assign``.

    Args:
        data: Table holding the identifiers (the demo passes a pandas
            DataFrame; any object supporting ``data[id_column].apply`` and
            ``data.assign`` would do).
        id_column: Name of the column holding the identifiers.
        test_ratio: Forwarded to ``split_check``.
        **kwargs: Optional ``validation_ratio`` forwarded to ``split_check``;
            defaults to 0 when omitted. Other keywords are ignored.

    Returns:
        The result of ``data.assign(split_assignment=...)``.
    """
    # .get() instead of [] so callers that only want a train/test split do
    # not hit a KeyError for the optional keyword.
    validation_ratio = kwargs.get("validation_ratio", 0)
    split_assignment = data[id_column].apply(
        lambda unique_id: split_check(unique_id, test_ratio, validation_ratio)
    )
    return data.assign(split_assignment=split_assignment)

# Demo

In [25]:
import pandas as pd
# Demo input: a single "id" column holding the integers 1 through 1999.
df = pd.DataFrame(range(1, 2000), columns=["id"])

In [26]:
# Preview the input frame before any split labels are attached.
df

Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5
...,...
1994,1995
1995,1996
1996,1997
1997,1998


In [27]:
# Label every row of the demo frame, keyed on its "id" value.
df_split = add_split_column(
    data=df,
    id_column="id",
    test_ratio=0.2,
    validation_ratio=0.2,
)

In [28]:
# Show the frame returned by add_split_column, including its split_assignment column.
df_split

Unnamed: 0,id,split_assignment
0,1,training
1,2,training
2,3,training
3,4,test
4,5,training
...,...,...
1994,1995,training
1995,1996,training
1996,1997,test
1997,1998,training


In [23]:
# Count rows per split; the column created by add_split_column is named
# "split_assignment" (the earlier "which_set" name no longer exists).
df_split["split_assignment"].value_counts()

training      1232
test           426
validation     341
Name: which_set, dtype: int64