 # part 3: dataset construction

 for each outcome variable (LEAK and CLOT), split the data into training, validation, and test sets

 * use two validation sets so we can train, validate, and pseudo-test before the final testing stage.
 * [this](https://cs230-stanford.github.io/train-dev-test-split.html) is a nice guide to splitting data for a machine learning project


In [0]:
import pandas as pd
import numpy as np
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import python_modules.constants as constants



In [0]:
# fix the global numpy RNG so the split seeds drawn from it are reproducible
np.random.seed(2227)



In [0]:
# directory holding the study CSV files (note the trailing slash)
PATH = 'study_data/'

# positional ids and human-readable names for the four data splits
feature_set_names = list(range(4))
label_set_names = [
    'train',
    'val_1',
    'val_2',
    'test',
]



In [0]:
# load the full study table; the first CSV column becomes the index.
# NOTE: PATH already ends with '/', so this path contains a doubled separator
input_csv = f'{PATH}/study_data.csv'
df_main = pd.read_csv(input_csv, index_col=0, low_memory=False)



In [0]:
# pull the two outcome columns out as label series
df_leak, df_clot = df_main['LEAK'], df_main['CLOT']

# draw three split seeds per outcome from the global numpy RNG, each in
# [0, 500); LEAK draws first, so the order of these two lines matters
rs_leak = [np.random.randint(500, size=1)[0] for _ in range(3)]
rs_clot = [np.random.randint(500, size=1)[0] for _ in range(3)]

# drop the columns listed in constants.OUTCOME from the feature table
# (presumably the outcome columns -- confirm against python_modules.constants)
df_main = df_main.drop(columns=constants.OUTCOME)



In [0]:
# display the three random states drawn for the CLOT splits
rs_clot


 split out the data. since the outcomes of interest are so rare, stratify by outcome to ensure even splits.

 in order to do it totally evenly, each modeled outcome gets its own dataset

 * this probably isn't necessary and probably doesn't help the model. to get rid of stratification, delete the `stratify` kwarg from the `train_test_split` calls

In [0]:
def data_splitter(features, labels, outcome, rstates):
    """Split the data into four stratified subsets of roughly equal size.

    Three successive ``train_test_split`` calls each hold out part of the
    rows that are still unassigned, stratifying on the labels every time.
    The held-out pieces become the test, val_1 and val_2 sets; whatever is
    left is the training set.

    Args:
        features: DataFrame of predictor columns.
        labels: Series of outcome labels sharing ``features``' index. Its
            name becomes the label column of the returned frames, so it is
            expected to equal ``outcome``.
        outcome: name of the label column, used to report its incidence.
        rstates: indexable of at least three values, used in order as the
            ``random_state`` of the three splits.

    Returns:
        list of four DataFrames ``[train, val_1, val_2, test]``, each with
        the label column merged back onto the features by index.
    """
    # fraction of the *remaining* rows held out at each step: 1/4 of all
    # rows, then 1/3 of the remaining 3/4, then 1/2 of the remaining 1/2,
    # i.e. about a quarter of the original data each time
    holdout_fractions = (1/4, 1/3, 1/2)

    X_rest, y_rest = features, labels
    held_out = []
    for step, fraction in enumerate(holdout_fractions):
        X_rest, X_hold, y_rest, y_hold = train_test_split(
            X_rest,
            y_rest,
            test_size=fraction,
            stratify=y_rest,
            random_state=rstates[step],
        )
        held_out.append((X_hold, y_hold))

    # pieces were carved off in the order test, val_1, val_2
    test_pair, val_1_pair, val_2_pair = held_out
    ordered_pairs = [(X_rest, y_rest), val_1_pair, val_2_pair, test_pair]

    # re-attach each label series to its feature rows by index
    consolidated_sets = [
        X.merge(y.to_frame(), left_index=True, right_index=True)
        for X, y in ordered_pairs
    ]

    # make sure the split sets are sized properly
    print([len(subset) for subset in consolidated_sets])

    # inspect outcome incidences in each split set; comparing with True and
    # averaging yields 0.0 for a split without positive cases, where
    # value_counts()[True] would raise KeyError
    print([subset[outcome].eq(True).mean() for subset in consolidated_sets])

    return consolidated_sets



In [0]:
def data_splitter_2(features, labels, outcome, rstates):
    """Split the data exactly as ``data_splitter`` does.

    This function used to be a line-for-line copy of ``data_splitter``; it
    now forwards to it so the two cannot drift apart.

    NOTE(review): nothing in the visible cells calls this function --
    consider removing it.

    Args:
        features: DataFrame of predictor columns.
        labels: Series of outcome labels sharing ``features``' index.
        outcome: name of the label column.
        rstates: indexable of at least three random states, one per split.

    Returns:
        The list ``[train, val_1, val_2, test]`` produced by
        ``data_splitter``.
    """
    return data_splitter(features, labels, outcome, rstates)



In [0]:
# build the four LEAK-stratified splits (prints their sizes and incidences)
leak_sets = data_splitter(
    features=df_main,
    labels=df_leak,
    outcome='LEAK',
    rstates=rs_leak,
)



In [0]:
# build the four CLOT-stratified splits (prints their sizes and incidences)
clot_sets = data_splitter(
    features=df_main,
    labels=df_clot,
    outcome='CLOT',
    rstates=rs_clot,
)


 build dataframes back up and save them

In [0]:
# tag every row with the name of the split it landed in, once per outcome;
# the frames inside leak_sets / clot_sets are modified in place
for split_name, leak_part, clot_part in zip(label_set_names, leak_sets, clot_sets):
    leak_part['LEAK_SET'] = split_name
    clot_part['CLOT_SET'] = split_name



In [0]:
# stack the four splits of each outcome back into one table, in the order
# train, val_1, val_2, test
concat_leak, concat_clot = pd.concat(leak_sets), pd.concat(clot_sets)


 test to make sure the data is organized as anticipated. we are looking for where the val_2 set transitions into the test set.

 * uncomment the following two cells to see the pertinent portions of these tables

In [0]:
#concat_leak[len(leak_sets[0])*3 - 2:len(leak_sets[0])*3 + 2]



In [0]:
#concat_clot[len(clot_sets[0])*3 - 2:len(clot_sets[0])*3 + 2]



In [0]:
# attach the CLOT label and its split assignment to the LEAK table by index
clot_columns = concat_clot[['CLOT', 'CLOT_SET']]
df_main = concat_leak.merge(clot_columns, left_index=True, right_index=True)



In [0]:
# number of columns after the merge
df_main.shape[1]


 re-index

In [0]:
# replace the index with a fresh 0..n-1 range; the old index values are
# discarded (drop=True) rather than kept as a column
df_main = df_main.reset_index(drop=True)



In [0]:
# number of rows in the final table
df_main.shape[0]


 save

In [0]:
# write the split-annotated table (index included) into the PATH directory
output_csv = f'{PATH}/study_data_split.csv'
df_main.to_csv(output_csv)


