# K-Fold cross validation
Generate K training/test splits from the BEAT-PD training set.
Splits are produced with stratified K-fold cross-validation, stratified on `subject_id`.

In [1]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import os
import pandas as pd

from sklearn.model_selection import StratifiedKFold

In [2]:
# Cohorts to process; each name is the prefix of that cohort's CSV file names.
datasets = ['CIS', 'REAL']
# K: number of folds. seed: random state of the shuffled split, also embedded
# in every generated file name.
K, seed = 5, 34

### Generate K training/test files 

In [8]:
# Write K stratified train/test CSV pairs per dataset into `outputFolder`.
outputFolder = 'data'
if not os.path.isdir(outputFolder):
    logging.info(f'Making output dir: {outputFolder}')
    os.mkdir(outputFolder)

# Use the configured K (not a literal) so the number of folds written here
# always matches the number of folds read back by later cells.
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
for dataset in datasets:
    df = pd.read_csv(os.path.join('..', 'data', f'{dataset}-PD_Training_Data_IDs_Labels.csv'))
    logging.info(f'Dataset: {dataset}\tRecords: {df.shape[0]}')
    # Stratifying on subject_id keeps each subject's share of records roughly
    # equal in every fold.
    for i, (train_index, test_index) in enumerate(skf.split(df, df['subject_id'])):
        logging.info(f'({i})\tTrain: {train_index.shape[0]}\tTest: {test_index.shape[0]}')
        # split() yields positional row numbers, so select with iloc; this is
        # identical to loc for the default RangeIndex that read_csv produces.
        # NOTE(review): the row index is written as the first, unnamed CSV
        # column (pandas default) - kept as-is; confirm downstream readers
        # expect it before switching to index=False.
        df.iloc[train_index].to_csv(
            os.path.join(outputFolder, f'{dataset}-PD_Training_Data_IDs_Labels_S{seed}_K{i}.csv'))
        df.iloc[test_index].to_csv(
            os.path.join(outputFolder, f'{dataset}-PD_Test_Data_IDs_Labels_S{seed}_K{i}.csv'))

INFO: 2020-05-17 14:37:46,001: Making output dir: data
INFO: 2020-05-17 14:37:46,008: Dataset: CIS	Records: 1858
INFO: 2020-05-17 14:37:46,010: (0)	Train: 1486	Test: 372
INFO: 2020-05-17 14:37:46,016: (1)	Train: 1486	Test: 372
INFO: 2020-05-17 14:37:46,022: (2)	Train: 1486	Test: 372
INFO: 2020-05-17 14:37:46,028: (3)	Train: 1487	Test: 371
INFO: 2020-05-17 14:37:46,034: (4)	Train: 1487	Test: 371
INFO: 2020-05-17 14:37:46,041: Dataset: REAL	Records: 591
INFO: 2020-05-17 14:37:46,042: (0)	Train: 472	Test: 119
INFO: 2020-05-17 14:37:46,045: (1)	Train: 473	Test: 118
INFO: 2020-05-17 14:37:46,048: (2)	Train: 473	Test: 118
INFO: 2020-05-17 14:37:46,051: (3)	Train: 473	Test: 118
INFO: 2020-05-17 14:37:46,054: (4)	Train: 473	Test: 118


### Generate K prediction files

In [9]:
# Write, for every fold, one empty prediction template per sub-challenge
# listing the measurement_ids of that fold's CIS and REAL test records.
outputFolder = 'features'
if not os.path.isdir(outputFolder):
    logging.info(f'Making output dir: {outputFolder}')
    os.mkdir(outputFolder)

# File-name stems of the prediction files written for each fold.
challenges = ['SC1_on_off', 'SC2_dyskinesia', 'SC3_tremor']

for kval in range(K):
    cis_seed_file = f'data/CIS-PD_Test_Data_IDs_Labels_S{seed}_K{kval}.csv'
    real_seed_file = f'data/REAL-PD_Test_Data_IDs_Labels_S{seed}_K{kval}.csv'
    cis_csv = pd.read_csv(cis_seed_file)
    logging.info(f'Seed {seed} CIS: {cis_csv.shape[0]} input records in file {cis_seed_file}')
    real_csv = pd.read_csv(real_seed_file)
    logging.info(f'Seed {seed} REAL: {real_csv.shape[0]} input records in file {real_seed_file}')

    # CIS ids first, then REAL ids; 'prediction' is left blank to be filled in later.
    # NOTE(review): the concatenated index restarts at 0 for the REAL rows and
    # is written as the first, unnamed CSV column (pandas default) - kept
    # as-is; confirm downstream readers expect it.
    beat_df = pd.concat([cis_csv['measurement_id'], real_csv['measurement_id']]).to_frame(name='measurement_id')
    beat_df['prediction'] = ''

    # Write to the directory that was actually checked/created above.
    for challenge in challenges:
        beat_df.to_csv(os.path.join(outputFolder, f'BEAT-PD_{challenge}_S{seed}_K{kval}.csv'))

INFO: 2020-05-17 14:38:26,468: Making output dir: features
INFO: 2020-05-17 14:38:26,471: Seed 34 CIS: 372 input records in file data/CIS-PD_Test_Data_IDs_Labels_S34_K0.csv
INFO: 2020-05-17 14:38:26,473: Seed 34 REAL: 119 input records in file data/REAL-PD_Test_Data_IDs_Labels_S34_K0.csv
INFO: 2020-05-17 14:38:26,481: Seed 34 CIS: 372 input records in file data/CIS-PD_Test_Data_IDs_Labels_S34_K1.csv
INFO: 2020-05-17 14:38:26,483: Seed 34 REAL: 118 input records in file data/REAL-PD_Test_Data_IDs_Labels_S34_K1.csv
INFO: 2020-05-17 14:38:26,490: Seed 34 CIS: 372 input records in file data/CIS-PD_Test_Data_IDs_Labels_S34_K2.csv
INFO: 2020-05-17 14:38:26,491: Seed 34 REAL: 118 input records in file data/REAL-PD_Test_Data_IDs_Labels_S34_K2.csv
INFO: 2020-05-17 14:38:26,498: Seed 34 CIS: 371 input records in file data/CIS-PD_Test_Data_IDs_Labels_S34_K3.csv
INFO: 2020-05-17 14:38:26,500: Seed 34 REAL: 118 input records in file data/REAL-PD_Test_Data_IDs_Labels_S34_K3.csv
INFO: 2020-05-17 14:3