# K-Fold cross validation
Generate K training/test pairs from the BEAT-PD training set.
Splitting is stratified K-fold, using "subject_id" as the stratification label.

In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import os
import pandas as pd

from sklearn.model_selection import StratifiedKFold

In [None]:
# Dataset name prefixes; each one selects a
# '<name>-PD_Training_Data_IDs_Labels.csv' input file.
datasets = [
    'CIS',
    'REAL',
]

# Number of folds; the prediction-file cell iterates over range(K).
K = 5

# Random seed passed to the splitter and embedded in every output file name.
seed = 34

### Generate K training/test files 

In [None]:
# Write K stratified train/test splits of each dataset's label file into
# `outputFolder`, one pair of CSV files per (dataset, fold).
outputFolder = 'data'
if not os.path.isdir(outputFolder):
    logging.info(f'Making output dir: {outputFolder}')
    os.mkdir(outputFolder)

# Use the notebook-level K rather than a hard-coded fold count, so the number
# of fold files written here always matches what later cells iterate over.
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
for dataset in datasets:
    df = pd.read_csv(os.path.join('..', 'data', f'{dataset}-PD_Training_Data_IDs_Labels.csv'))
    logging.info(f'Dataset: {dataset}\tRecords: {df.shape[0]}')
    # subject_id is the stratification label, so each fold keeps roughly the
    # same per-subject proportions as the full file.
    for i, (train_index, test_index) in enumerate(skf.split(df, df['subject_id'])):
        logging.info(f'({i})\tTrain: {train_index.shape[0]}\tTest: {test_index.shape[0]}')
        fold_suffix = f'S{seed}_K{i}.csv'
        # split() yields positional row indices, so select with iloc; this does
        # not depend on df having a default 0..n-1 index.
        # to_csv keeps its default of writing the row index as the first column.
        df.iloc[train_index].to_csv(
            os.path.join(outputFolder, f'{dataset}-PD_Training_Data_IDs_Labels_{fold_suffix}'))
        df.iloc[test_index].to_csv(
            os.path.join(outputFolder, f'{dataset}-PD_Test_Data_IDs_Labels_{fold_suffix}'))

### Generate K prediction files

In [None]:
# For every fold, build an empty prediction template (measurement_id plus a
# blank 'prediction' column) from the fold's CIS and REAL test files, and
# write it once per output file name into `outputFolder`.
outputFolder = 'features'
if not os.path.isdir(outputFolder):
    logging.info(f'Making output dir: {outputFolder}')
    os.mkdir(outputFolder)

for kval in range(K):
    cis_seed_file = f'data/CIS-PD_Test_Data_IDs_Labels_S{seed}_K{kval}.csv'
    real_seed_file = f'data/REAL-PD_Test_Data_IDs_Labels_S{seed}_K{kval}.csv'
    cis_csv = pd.read_csv(cis_seed_file)
    logging.info(f'Seed {seed} CIS: {cis_csv.shape[0]} input records in file {cis_seed_file}')
    real_csv = pd.read_csv(real_seed_file)
    logging.info(f'Seed {seed} REAL: {real_csv.shape[0]} input records in file {real_seed_file}')

    # CIS ids first, then REAL ids; the 'prediction' column is left blank.
    beat_df = pd.DataFrame(columns=['measurement_id', 'prediction'])
    beat_df['measurement_id'] = pd.concat([cis_csv['measurement_id'], real_csv['measurement_id']])
    beat_df['prediction'] = ''

    # NOTE(review): to_csv writes the concatenated (non-unique) row index as
    # the first column; left unchanged in case downstream readers rely on it.
    # Write into outputFolder (the directory created above) instead of a
    # separately hard-coded path, so the two cannot drift apart.
    for target in ('SC1_on_off', 'SC2_dyskinesia', 'SC3_tremor'):
        beat_df.to_csv(os.path.join(outputFolder, f'BEAT-PD_{target}_S{seed}_K{kval}.csv'))