In [1]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
  
# Download dataset id 47 from the UCI ML repository.
horse_colic = fetch_ucirepo(id=47)

# Feature and target tables (pandas DataFrames).
X, y = horse_colic.data.features, horse_colic.data.targets

# Join features and targets side by side into one working frame.
df = pd.concat([X, y], axis="columns")

In [2]:
# Display the combined frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdominocentesis_total_protein,outcome,lesion_site,lesion_type,lesion_subtype,cp_data,surgical_lesion
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,11300,0,0,2,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2208,0,0,2,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,0,0,0,1,2
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,2208,0,0,1,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,4300,0,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,2.0,1,529695,38.6,60.0,30.0,1.0,1.0,3.0,1.0,...,40.0,6.0,1.0,,1.0,3205,0,0,2,1
364,2.0,1,528452,37.8,42.0,40.0,1.0,1.0,1.0,1.0,...,36.0,6.2,,,1.0,4124,0,0,2,2
365,1.0,1,534783,38.0,60.0,12.0,1.0,1.0,2.0,1.0,...,44.0,65.0,3.0,2.0,3.0,2209,0,0,2,1
366,2.0,1,528926,38.0,42.0,12.0,3.0,,3.0,1.0,...,37.0,5.8,,,1.0,3111,0,0,2,2


In [6]:
# List the distinct values of every non-numeric column.
# select_dtypes(exclude='number') recognises every numeric dtype, not just
# float64/int64, so e.g. int32 or float32 columns are no longer reported
# as if they were categorical.
for c in df.select_dtypes(exclude='number').columns:
    print(f"{c}:", df[c].unique())

In [32]:
def binarize(col):
    """Replace known two-level string labels in a Series with numeric codes.

    Every element equal to one of the labels below is replaced by its code;
    all other elements are left untouched.

    Parameters
    ----------
    col : pandas.Series
        Values to recode. The input is NOT modified; a recoded copy is
        returned instead.

    Returns
    -------
    pandas.Series
        Copy of ``col`` with the matching labels replaced.

    Notes
    -----
    NOTE(review): none of these labels ('ckd', 'notckd', ...) appear in the
    numeric preview of ``df`` shown earlier in this notebook -- this table
    looks carried over from a different dataset; confirm it is still needed.
    """
    replacements = {
        'normal': 1.0,
        'abnormal': 0.0,

        'yes': 1.0,
        'no': 0.0,
        '\tno': 0.0,  # variant with a stray leading tab

        'present': 1.0,
        'notpresent': 0.0,

        'good': 1.0,
        'poor': 0.0,

        # Binarize labels
        'ckd': 1,
        'ckd\t': 1,  # variant with a stray trailing tab
        'notckd': 0,
    }

    # Work on a copy: DataFrame.apply hands in objects backed by its own
    # internal data, and writing into them in place is fragile.
    result = col.copy()
    for label, code in replacements.items():
        result[result == label] = code
    return result

# Recode column by column. binarize works element-wise, so the replacements
# are the same as with a row-wise apply, but axis=0 keeps each column's own
# dtype; axis=1 would upcast every mixed int/float row to a single dtype
# and turn integer columns into floats.
df = df.apply(binarize, axis=0)

In [10]:
# Shift surgical_lesion down by one (the rows previewed above hold 1 and 2).
# Caution: re-running this cell subtracts one again.
df['surgical_lesion'] = df['surgical_lesion'].sub(1)

In [11]:
import json
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import KFold

# All CSV files below are written into the current working directory.
outdir = Path('.')
outdir.mkdir(exist_ok=True)

n_splits = 10  # outer K-fold: one holdout file per split
n_folds = 5    # inner K-fold: train/val pairs per development set
idx = np.arange(len(df))

kf_splits = KFold(n_splits=n_splits, shuffle=True, random_state=1896)

for holdout_num, (devel_pos, test_pos) in enumerate(kf_splits.split(idx)):
    # Positional row numbers for the development and holdout parts of this split.
    idx_train, idx_test = idx[devel_pos], idx[test_pos]
    devel_fold = df.iloc[idx_train]
    test_fold = df.iloc[idx_test]

    # Sanity check: no row is an exact duplicate of another across
    # development + holdout (compares row values, not index labels).
    assert not pd.concat([devel_fold, test_fold]).duplicated().any()

    test_fold.to_csv(outdir / f'holdout_{holdout_num}.csv', index=False)

    # Inner split of the development rows, with a seed derived from the
    # holdout number so every outer split gets its own shuffle.
    kf_folds = KFold(n_splits=n_folds, shuffle=True, random_state=165782 * holdout_num)
    idx_folds = np.arange(len(devel_fold))
    for fold_num, (train_pos, val_pos) in enumerate(kf_folds.split(idx_folds)):
        train_fold = devel_fold.iloc[train_pos]
        val_fold = devel_fold.iloc[val_pos]
        train_fold.to_csv(outdir / f'devel_{holdout_num}_train_{fold_num}.csv', index=False)
        val_fold.to_csv(outdir / f'devel_{holdout_num}_val_{fold_num}.csv', index=False)