# Splitter

This example notebook will show how to split data into several groups based on desired parameters and metrics.

In [None]:
import warnings
# Silence all warnings so the example output stays readable.
warnings.filterwarnings('ignore')
# `category` must be a single Warning subclass, not a list: a list is stored
# as-is in the filter table and breaks the subclass check performed when a
# warning is later emitted. Register one filter per category instead.
for _category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_category)

import sys
import logging
import numpy as np
import pandas as pd

from abacus.splitter.params import SplitBuilderParams
from abacus.splitter.split_builder import SplitBuilder

logging.basicConfig(level = logging.INFO)

%load_ext autoreload
%autoreload 2

Some modifications to the existing data:

In [None]:
# Load the first 15,000 rows of the example dataset.
df = pd.read_csv('./data/ab_data_height.csv', nrows=15_000)

# Seeded generator: the synthetic columns below (and therefore the split
# built from them) are identical on every Restart & Run All.
rng = np.random.default_rng(42)

# Synthetic column used as the main strata: random labels 1..4, stored as strings.
df["moda_city"] = rng.integers(1, 5, size=df.shape[0])
df["moda_city"] = df["moda_city"].astype(str)
# Synthetic categorical column: random integer codes 1 or 2.
df["country"] = rng.integers(1, 3, size=df.shape[0])
# Use the row index as the identifier column.
df["id"] = df.index

df.head()

Parameters for splitting:

In [None]:
# Parameters for the split: two groups ("control" and "target"), the columns
# of `df` to use, and the numeric settings passed to the builder.
# NOTE(review): the requested group sizes sum to 20,000, while the data cell
# loads only 15,000 rows (nrows=15_000) - confirm SplitBuilder accepts this.
split_builder_params = SplitBuilderParams(
    map_group_names_to_sizes={"control": 10_000, "target": 10_000},
    main_strata_col="moda_city",
    split_metric_col="height_now",
    metric_type="continuous",
    id_col="id",
    cols=["height_prev"],
    cat_cols=["country"],
    alpha=0.05,
    n_bins=3,
    min_cluster_size=500,
)

In [None]:
# Create the builder from the prepared frame and the parameters defined above.
split_builder = SplitBuilder(df, split_builder_params)

Splitting simulation:

In [None]:
# Run the split and keep the result for inspection in the next cell.
split = split_builder.collect()

Take a look at the new column with the group name:

In [None]:
# Preview the first rows of the split result.
split.head()