# core

> Functions to assist in running studies of model training and testing on different sizes and types of data sets.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| export
import numpy as np
import pandas as pd

In [None]:
#| export

def sample_rows_and_cols(X: pd.DataFrame, y: pd.DataFrame, num_sampled_rows: int, num_sampled_columns: int, random_state: int, replace: bool) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Sample the same num_sampled_rows rows from X and y, and num_sampled_columns columns from X.

    Rows are chosen by position, so X and y must be row-aligned (row i of y is the
    target for row i of X) and have the same number of rows. The index labels of the
    inputs are irrelevant to the selection and are carried through to the outputs.

    Sampling can be with or without replacement, based on the boolean value of replace.
    The same replace flag is used for both the row draw and the column draw.

    Parameters
    ----------
    X
        DataFrame to sample. If replace=False, must have shape >= (num_sampled_rows, num_sampled_columns).
    y
        Single-column DataFrame to sample. Must have at least as many rows as X.
    num_sampled_rows
        Integer number of rows to sample.
    num_sampled_columns
        Integer number of columns to sample.
    random_state
        Integer seed used for both the row draw and the column draw, useful for
        reproducible testing. Do not set in production (pass None to draw fresh
        entropy on every call).
    replace
        Whether to sample with replacement.

    Returns
    -------
    X, y
        Sampled version of the input dataframes where X.shape == (num_sampled_rows, num_sampled_columns) and y.shape == (num_sampled_rows, 1)

    Raises
    ------
    ValueError
        If replace=False and more rows or columns are requested than X has.
    IndexError
        If y has fewer rows than a sampled row position (for example, an empty y).
    """
    # A locally seeded generator makes the row draw reproducible without touching
    # NumPy's global random state.
    rng = np.random.default_rng(random_state)
    # Draw row *positions* (0..len(X)-1), not index labels: the selection below uses
    # .iloc, which is positional, so this also works for non-default indexes.
    row_positions = rng.choice(len(X), size=num_sampled_rows, replace=replace)
    X_subset = X.iloc[row_positions]
    y_subset = y.iloc[row_positions]
    X_subset = X_subset.sample(n=num_sampled_columns, random_state=random_state, replace=replace, axis=1)
    return X_subset, y_subset

In [None]:
# Happy case test.
X = pd.DataFrame({'a':range(1,10), 'b':range(2,11), 'c':range(3,12)})
# y needs the same number of rows as X (9) so every sampled row exists in both frames.
y = pd.DataFrame({'target':range(100,109)})
random_state=1
num_rows=3
num_cols=2
# precondition: y should have one column
test_eq(y.shape[1], 1)
# precondition: X and y should have the same number of rows
test_eq(X.shape[0], y.shape[0])
X_sub, y_sub = sample_rows_and_cols(X, y, num_rows, num_cols, random_state=random_state, replace=False)
test_eq(y_sub.shape, (num_rows, 1))
test_eq(X_sub.shape, (num_rows, num_cols))
# The y rows must match the X rows: same rows selected, in the same order.
test_eq(list(X_sub.index), list(y_sub.index))
# Row i of y holds 100 + i, so each sampled target must agree with its row label.
test_eq(list(y_sub['target']), [100 + i for i in y_sub.index])

In [None]:
# Test no data in y.
# An empty y has no rows at the sampled positions, so the call is expected to fail
# with an error whose message mentions "out-of-bounds".
y = pd.DataFrame()

def _sample_with_empty_y():
    return sample_rows_and_cols(
        X, y, num_rows, num_cols, random_state=random_state, replace=False
    )

test_fail(_sample_with_empty_y, contains="out-of-bounds")

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()