In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../..')
from modules.many_features import utils, constants

In [2]:
# One seed shared by every random number generator seeded in this notebook,
# so the sampled subsets can be regenerated exactly on a fresh run.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

In [3]:
def sample_train_set(x, y, frac):
    """Draw a random subset of rows from a feature table and its labels.

    Rows are drawn without replacement with the module-level ``random``
    generator, so the result is reproducible after ``random.seed``.
    Row *positions* are sampled rather than index labels: selecting by label
    returns every row carrying a drawn label, which yields too many rows when
    the index contains repeated labels. For a unique index the same rows are
    selected as with label-based sampling.

    Args:
        x: pandas object holding the features (one row per sample).
        y: pandas object holding the labels for the rows of ``x``.
        frac: Fraction of the rows of ``x`` to keep; the number of rows
            returned is ``int(frac * len(x))``.

    Returns:
        Tuple ``(sampled_x, sampled_y)`` with the selected rows in the order
        they were drawn; original index labels are preserved.

    Raises:
        ValueError: Propagated from ``random.sample`` when the requested
            sample size is negative or larger than ``len(x)``.
    """
    sample_num = int(frac*len(x))
    # Sample positions, not labels, so duplicated labels cannot multiply rows.
    positions = random.sample(range(len(x)), sample_num)
    sampled_x = x.iloc[positions]
    if y.index.equals(x.index):
        # Same index in the same order: positions identify matching rows.
        sampled_y = y.iloc[positions]
    else:
        # y is not positionally aligned with x; look its rows up by label.
        sampled_y = y.loc[list(sampled_x.index)]
    return sampled_x, sampled_y

In [4]:
def sample_train_set2(x, y, frac):
    """Return a subset of ``x`` and ``y`` stratified on ``y``.

    Delegates to ``train_test_split`` with ``test_size=1-frac``,
    ``stratify=y`` and ``random_state=SEED``, and keeps only the training
    side of the split. The exact number of rows kept is whatever
    ``train_test_split`` allots to the training side for that ``test_size``.

    Args:
        x: Features to split.
        y: Labels to split; also used as the stratification target.
        frac: Fraction of the data intended for the returned subset.

    Returns:
        Tuple ``(x_kept, y_kept)``: the training portions of ``x`` and ``y``.
    """
    # The split yields (x_train, x_rest, y_train, y_rest); the held-out
    # remainder is not needed here.
    x_kept, _, y_kept, _ = train_test_split(
        x, y, test_size=1-frac, stratify=y, random_state=SEED
    )
    return x_kept, y_kept

#### The dataset

In [5]:
# Load the full training table from CSV.
train_df = pd.read_csv('../../../anemia_ml4hc/data/train_set_basic.csv')
# The last column is the target; every column before it goes into X_set.
X_set, y_set = train_df.iloc[:, :-1], train_df.iloc[:, -1]
train_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,14.728733,-1.0,3.170892,-1.0,-1.0,-1.0,-1.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,44.1862,-1.0,0
1,10.405752,9.634615,5.659537,-1.0,-1.0,77.413788,212.671838,4.032519,0,0.88713,96.311597,-1.0,43.218595,-1.0,83.207518,31.217256,-1.0,4
2,15.132737,358.914888,1.842252,3.797487,315.102272,80.500314,-1.0,5.639507,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,45.398211,-1.0,0
3,11.340169,-1.0,1.662209,2.441767,-1.0,97.033963,102.079062,3.506041,1,1.020527,127.281715,-1.0,20.847013,-1.0,62.210273,34.020508,-1.0,6
4,6.691485,-1.0,3.337971,-1.0,-1.0,99.838438,24.119564,2.010694,0,1.957666,34.633063,-1.0,34.612121,-1.0,112.411298,20.074456,-1.0,5


In [6]:
# Fractions of the full training set to subsample.
fracs = [
    0.01,
    0.05,
    0.1,
    0.5,
]

In [7]:
# Write one subsampled copy of the training set per requested fraction.
for frac in fracs:
    x, y = sample_train_set(X_set, y_set, frac)
    # Renumber the rows of both pieces from 0 so they sit side by side
    # row-for-row in the combined table.
    pieces = [piece.reset_index(drop=True) for piece in (x, y)]
    df = pd.concat(pieces, axis=1)
    print(f'frac:{frac} - {len(df)} samples')
    # The fraction and the seed are encoded in the output file name.
    out_path = f'../../../anemia_ml4hc/data/train_set_basic_{frac}_seed_{SEED}.csv'
    df.to_csv(out_path, index=False)

frac:0.01 - 504 samples
frac:0.05 - 2520 samples
frac:0.1 - 5040 samples
frac:0.5 - 25200 samples
