# Generating and splitting a synthetic dataset

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold

## Generate a synthetic dataset

In [2]:
# Dataset dimensions. The same three values are embedded in the name of the
# output directory so that each configuration gets its own folder.
N_SAMPLES = 1_000
N_FEATURES = 25
N_INFORMATIVE = 25

# Directory that receives the CSV files written by this notebook; it is
# created on the spot (parents included) and reused if it already exists.
outdir = Path(f'SYNTHETIC_{N_SAMPLES}_SAMPLES_{N_FEATURES}_FEATURES_{N_INFORMATIVE}_INFORMATIVE')
outdir.mkdir(parents=True, exist_ok=True)

In [3]:
# Draw the synthetic classification problem: x is the feature matrix, y the
# class labels. n_informative is taken from N_INFORMATIVE (not N_FEATURES) so
# the generated data always matches the count advertised in the output
# directory name. With n_redundant=0, no redundant features are generated.
# random_state is fixed so the same data is produced on every run.
x, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=N_INFORMATIVE,
    n_redundant=0,
    random_state=1234
)

In [4]:
### INJECT MAR MISSINGNESS
# A value in a target column is replaced by NaN only when
#   (a) the row's random draw below is 1 (probability added_missingness_rate), and
#   (b) the row's label equals the "paired column is at or above its 0.6
#       quantile" indicator.
# The paired column itself is left untouched.

added_missingness_num_cols = 1
added_missingness_rate = 0.2

np.random.seed(0)
target_cols = np.array([0])  # columns that receive NaNs
inter_cols = np.array([1])   # paired column driving the missingness of each target
targets = np.random.choice(
    [0, 1],
    size=(y.shape[0], target_cols.shape[0]),
    p=[1-added_missingness_rate, added_missingness_rate],
)

for i, col in enumerate(target_cols):
    print(f"Adding missingness to column: {col}")
    driver_values = x[:, inter_cols[i]]
    # True where the paired column lies at or above its 0.6 quantile.
    above_cutoff = driver_values >= np.quantile(driver_values, .6)
    # Element-wise comparison of the label with the boolean indicator.
    label_matches = y == above_cutoff
    mask = label_matches & (targets[:, i] == 1)
    x[mask, col] = np.nan

Adding missingness to column: 0


In [5]:
# Features keep their default integer column labels; the labels go into a
# single column named 'output', appended to the right of the features.
features_df = pd.DataFrame(x)
output_df = pd.DataFrame(y, columns=['output'])
data_df = pd.concat(
    [features_df, output_df],
    axis=1,
)

In [6]:
# Show the assembled frame: the feature columns followed by 'output'.
data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,output
0,0.438197,-1.191451,1.001693,-2.786047,0.576669,1.920034,0.921721,0.156113,-3.917061,0.045911,...,1.413880,-2.602892,0.385898,1.060973,3.266673,2.740426,4.567134,-2.879384,-1.806647,0
1,-1.621732,-0.954239,1.826071,3.776377,-2.505057,-3.979924,-1.711538,1.069452,-3.004176,2.734760,...,-0.164082,1.364287,0.284669,0.951135,0.952302,1.898508,-0.620184,0.093870,2.275429,1
2,5.156404,-3.726071,3.269837,-3.990850,-0.735575,-6.873679,1.661168,-1.472288,0.244928,-1.975712,...,0.778176,0.890527,-1.762711,3.879442,4.936931,0.035877,1.424619,2.244002,-7.220646,0
3,2.006585,2.891556,-5.115467,-2.255901,0.849719,-2.885901,0.568725,-3.877307,-3.473440,2.139569,...,-3.085900,0.117296,-0.924385,0.507172,-1.137078,-0.831522,2.066444,-3.918678,-3.198070,0
4,-3.316983,1.623225,-1.630156,-3.174470,1.295780,-6.640943,-4.988686,1.766230,1.979029,3.869918,...,-3.151733,2.545054,-0.819551,-4.250769,-0.405257,5.266958,0.925338,0.610499,-1.171616,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.764133,-1.328269,-2.195970,-0.668578,3.130584,3.049736,-0.826202,-3.686800,3.544626,-0.226208,...,1.150557,4.887537,0.699942,1.849942,-3.931751,-1.035353,2.399222,3.499846,-0.487522,0
996,-3.792739,-1.555733,3.111561,-3.124815,-5.902222,-4.736217,-1.740016,-3.631429,1.782080,-1.667041,...,1.315356,-0.247061,-2.675003,4.484417,0.887037,3.440439,4.462018,4.585732,-0.805720,1
997,,-0.646899,-4.025954,1.984308,1.500070,0.611716,2.074119,-2.562851,3.943713,-0.966447,...,1.815956,2.333834,-1.518853,1.093008,-3.344606,-1.733780,4.276645,-1.432138,-4.130900,0
998,-0.782770,2.148991,0.953576,0.824542,-0.310086,-0.390065,-3.661193,-1.127242,0.461713,2.136723,...,2.455531,-3.073843,-1.626840,-0.557212,-0.388773,1.291002,-0.626322,0.248460,-0.733973,1


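Save a copy of the full dataset for future reference. Note that this file already contains the MAR missingness injected into column 0 above; no MCAR missingness has been added yet.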

In [7]:
# index=False keeps the DataFrame's row index out of the CSV file.
data_df.to_csv(outdir / 'synthetic_complete.csv', index=False)

## Create training and test sets with missing-completely-at-random (MCAR) missingness

In [8]:
def binary_sampler(p, rows, cols, seed=6289278):
    """Return a reproducible 0/1 matrix whose entries are 1 with probability ``p``.

    The draws come from a private ``np.random.RandomState`` rather than from
    NumPy's global generator, so calling this function does not reset the
    random stream used by any other code. For a given ``seed`` the result is
    the same matrix that ``np.random.seed(seed)`` followed by
    ``np.random.uniform`` would produce.

    Because the seed is fixed, two calls with the same shape reuse the same
    uniform draws: the set of ones for a smaller ``p`` is always a subset of
    the set of ones for a larger ``p``.

    Parameters
    ----------
    p : float
        Threshold applied to the uniform draws; an entry is 1 when its draw
        is strictly below ``p``.
    rows, cols : int
        Shape of the returned matrix.
    seed : int, optional
        Seed of the private generator. Defaults to 6289278.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(rows, cols)`` containing only 0 and 1.
    """
    rng = np.random.RandomState(seed)
    unif_random_matrix = rng.uniform(0., 1., size=(rows, cols))
    # Multiplying by 1 turns the boolean comparison into 0/1 integers.
    binary_random_matrix = 1 * (unif_random_matrix < p)
    return binary_random_matrix


def make_missing_mcar(data_df, miss_rate=0.25, outcome_column='output'):
    """Return a copy of ``data_df`` with feature values removed at random.

    Each feature entry is replaced by NaN wherever the 0/1 matrix returned by
    ``binary_sampler(1 - miss_rate, ...)`` is 0. The outcome column is never
    altered. ``data_df`` itself is not modified.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the feature columns plus ``outcome_column``.
    miss_rate : float, optional
        Passed to ``binary_sampler`` as ``1 - miss_rate``; 0 leaves every
        feature value in place.
    outcome_column : str, optional
        Name of the column that is kept intact.

    Returns
    -------
    pandas.DataFrame
        The features followed by the outcome column as the last column. The
        row index is reset to 0..n-1, and the feature columns keep the labels
        they had in ``data_df``.
    """
    data_features = data_df.drop(columns=[outcome_column])
    # np.array copies the values, so the NaN assignment below cannot leak
    # back into data_df.
    miss_data_features_arr = np.array(data_features)

    n_rows, n_cols = miss_data_features_arr.shape

    # Mask convention: 1 = keep the value, 0 = make it missing.
    data_features_mask = binary_sampler(1 - miss_rate, n_rows, n_cols)
    miss_data_features_arr[data_features_mask == 0] = np.nan

    # Re-attach the original feature labels instead of falling back to
    # positional 0..n-1 column names.
    miss_data_features = pd.DataFrame(
        miss_data_features_arr, columns=data_features.columns
    )
    # Drop the incoming index so both parts line up row-by-row in the concat.
    outcome = pd.DataFrame(data_df[outcome_column].reset_index(drop=True))

    miss_data = pd.concat([miss_data_features, outcome], axis=1)

    return miss_data

In [9]:
n_splits = 3
n_folds = 5
idx = np.arange(len(data_df))

# Outer level: n_splits shuffled partitions of the rows. Each partition
# yields one development set and one hold-out test set.
kf_splits = KFold(n_splits=n_splits, random_state=1896, shuffle=True)

for holdout_num, (devel_pos, holdout_pos) in enumerate(kf_splits.split(idx)):
    idx_train, idx_test = idx[devel_pos], idx[holdout_pos]
    devel_fold = data_df.iloc[idx_train]
    test_fold = data_df.iloc[idx_test]

    # Every combination of train / test missingness rate gets its own files;
    # both rates are encoded in the file names.
    for train_percentage in [0, 0.25, 0.50]:
        for test_percentage in [0, 0.25, 0.50]:
            percent_str = f'train_missing_{train_percentage}_test_missing_{test_percentage}'
            train_data = make_missing_mcar(devel_fold, train_percentage)
            test_data = make_missing_mcar(test_fold, test_percentage)

            holdout_path = outdir / f'holdout_{holdout_num}_{percent_str}.csv'
            test_data.to_csv(holdout_path, index=False)

            # Inner level: n_folds train/validation splits of the development
            # data. The seed depends only on the hold-out number.
            kf_folds = KFold(n_splits=n_folds, random_state=165782 * holdout_num, shuffle=True)
            idx_folds = np.arange(len(train_data))
            for fold_num, (fit_pos, val_pos) in enumerate(kf_folds.split(idx_folds)):
                train_fold = train_data.iloc[fit_pos]
                val_fold = train_data.iloc[val_pos]
                train_path = outdir / f'devel_{holdout_num}_train_{fold_num}_{percent_str}.csv'
                val_path = outdir / f'devel_{holdout_num}_val_{fold_num}_{percent_str}.csv'
                train_fold.to_csv(train_path, index=False)
                val_fold.to_csv(val_path, index=False)