# Cherry pick
Generate the test prediction files using cherry picking from ensembles of feature predictions

In [1]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import os
import time
import getpass

import glob

import numpy as np
logging.info("Numpy version " + np.__version__)
import scipy as sp
import scipy.signal as sig
logging.info("Scipy version " + sp.__version__)
import pandas as pd
logging.info("Pandas version " + pd.__version__)
import matplotlib
import matplotlib.pyplot as plt
logging.info("Matplotlib version " + matplotlib.__version__)

%load_ext autoreload
%autoreload 2

%matplotlib inline

# For the standard seed use 34
np.random.seed (34)

INFO: 2020-05-18 15:16:26,936: Numpy version 1.18.2
INFO: 2020-05-18 15:16:27,072: Scipy version 1.4.1
INFO: 2020-05-18 15:16:27,166: Pandas version 1.0.3
INFO: 2020-05-18 15:16:27,282: Matplotlib version 3.2.0


### Define parameters

In [2]:
# Cohorts whose label files are read; the upper-cased name is the file-name prefix.
datasets = ['cis', 'real']
# Label columns that are scored independently of each other.
score_names = ['tremor', 'dyskinesia', 'on_off']

# Folder holding the training/test label CSV files.
labels_folder = 'data'
# Fold indices; each one is appended to `test_rand` to name a split.
Ks = list(range(5))
test_rand = 'S34_K'

# dropNaive: never replace the naive estimate for subjects flagged in `naive_df`.
dropNaive = True
# onlyGood: replace the naive estimate only for subjects flagged in `good_df`.
onlyGood = False

In [3]:
def BEATPD_loss(df, real_score, est_score, subject_ids=None):
    """Weighted mean of per-subject MSEs between two columns of `df`.

    Each subject's mean squared error is weighted by the square root of the
    number of rows that subject has in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'subject_id' column plus `real_score` and `est_score`.
    real_score : str
        Column holding the true labels.
    est_score : str
        Column holding the estimates.
    subject_ids : iterable, optional
        Subjects to evaluate. When omitted, the notebook-level
        `subjects[real_score]` is used, which is what the existing
        three-argument calls rely on.

    Returns
    -------
    float
        The weighted loss, or NaN when no subject has any usable label.
    """
    if subject_ids is None:
        # Legacy path: look the subjects up by the column actually being
        # scored, not by whatever the global loop variable `score` holds.
        subject_ids = subjects[real_score]

    subject_mse = {}
    subject_countsq = {}
    for subject in subject_ids:
        idx = (df['subject_id'] == subject)
        labels = df.loc[idx, real_score]
        # Skip subjects with no rows or only missing labels. `not` is used
        # because `~` is a bitwise operator and is truthy on a plain bool.
        if labels.isna().all():
            continue
        subject_mse[subject] = ((labels - df.loc[idx, est_score]) ** 2).mean()
        subject_countsq[subject] = np.sqrt(idx.sum())

    if not subject_countsq:
        # Nothing to average; avoid a 0/0 ZeroDivisionError.
        return np.nan

    weighted = sum(subject_mse[s] * subject_countsq[s] for s in subject_mse)
    return weighted / sum(subject_countsq.values())

def kloss(df, real_score, est_score, subject_ids=None):
    """Per-subject mean squared error between two columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'subject_id' column plus `real_score` and `est_score`.
    real_score : str
        Column holding the true labels.
    est_score : str
        Column holding the estimates.
    subject_ids : iterable, optional
        Subjects to evaluate. When omitted, the notebook-level
        `subjects[real_score]` is used, which is what the existing
        three-argument calls rely on.

    Returns
    -------
    dict
        Maps subject id to its MSE. Subjects with no rows in `df`, or with
        only missing labels, are left out.
    """
    if subject_ids is None:
        # Legacy path: key the lookup on the column being scored rather than
        # on the global loop variable `score`.
        subject_ids = subjects[real_score]

    loss = {}
    for subject in subject_ids:
        idx = (df['subject_id'] == subject)
        labels = df.loc[idx, real_score]
        # `not` instead of `~`: bitwise invert is truthy on a plain bool.
        if not labels.isna().all():
            loss[subject] = ((labels - df.loc[idx, est_score]) ** 2).mean()
    return loss
    

### Define good and naive subjects

In [4]:
# Baseline pass over all folds: score the naive per-subject mean, the feature
# ensemble and the cherry-picked blend without any subject filtering.
# parms[K] ends up holding:
#   (score, 'score', 'naive') -> per-subject MSE of the naive estimate
#   (score, 'score', 'ayala') -> per-subject MSE of the adjusted ensemble mean
#   'score'                   -> DataFrame of weighted losses per method and score
parms  = { }
for K in Ks:
    parms [K] = {}

    logging.info(f'K iterator {K}')
    training_labels, test_labels, test_data_folder = {}, {}, {}

    # Read the training and test label files of this fold for each cohort.
    for dataset in datasets:
        test_label_file = os.path.join(labels_folder, dataset.upper()+f'-PD_Test_Data_IDs_Labels_{test_rand}{K}.csv')
        training_label_file = os.path.join(labels_folder, dataset.upper()+f'-PD_Training_Data_IDs_Labels_{test_rand}{K}.csv')

        training_labels[dataset] = pd.read_csv(training_label_file)
        logging.info(f'Training {dataset}: Read file "{training_label_file}" containing {training_labels[dataset].shape[0]} records')

        test_labels[dataset] = pd.read_csv(test_label_file)
        logging.info(f'Test {dataset}: Read file "{test_label_file}" containing {test_labels[dataset].shape[0]} records')

    # Per score: merge both cohorts, drop unlabelled rows and attach the naive
    # estimate (mean/std of the subject's training labels) to every test row.
    train_df, test_df = {}, {}
    subjects, test_subject_sq = {}, {}
    for score in score_names:
        field_cols = ['measurement_id', 'subject_id', score]
        train_df[score] = pd.concat([training_labels['cis'][field_cols], training_labels['real'][field_cols]]).dropna(subset=[score])
        test_df[score] = pd.concat([test_labels['cis'][field_cols], test_labels['real'][field_cols]]).dropna(subset=[score])
        subjects[score] = train_df[score]['subject_id'].unique()
        logging.info(f'Score: {score}: unique subjects {subjects[score].shape[0]}:\n{subjects[score]}')
        # Iterate over this score's subjects. Iterating over `subjects` itself
        # walks the dict keys (score names), so the counts were taken against
        # score names rather than subject ids.
        test_subject_sq[score] = [np.sqrt((test_df[score]['subject_id']==subject).sum()) for subject in subjects[score]]
        for subject in subjects[score]:
            # Masks and training labels are the same for both assignments; compute once.
            test_mask = test_df[score]['subject_id']==subject
            train_scores = train_df[score][train_df[score]['subject_id']==subject][score]
            test_df[score].loc[test_mask, 'naive_mean'] = train_scores.mean()
            test_df[score].loc[test_mask, 'naive_std'] = train_scores.std()
        test_df[score].set_index('measurement_id', inplace=True)

    # Attach one column f<i> per feature prediction file, aligned on measurement_id.
    for score in score_names:
        feature_results = sorted(glob.glob(f'features/{test_rand}{K}/*{score}_*.csv'))
        logging.info(f'Score {score} - number of feature files {len(feature_results)}')
        for i, file_name in enumerate(feature_results):
            d = pd.read_csv(file_name).set_index('measurement_id')
            test_df[score][f'f{i}'] = d['prediction']
            # Where a file has no prediction, fall back to the naive estimate.
            idx = test_df[score][f'f{i}'].isna() & ~test_df[score]['naive_mean'].isna()
            test_df[score].loc[idx,f'f{i}'] = test_df[score].loc[idx,'naive_mean']

    # Ensemble statistics over the raw (f*) and the subject-adjusted (a*) predictions.
    for j, score in enumerate(score_names):
        feature_cols = test_df[score].filter(regex='^f')
        test_df[score]['f_mean'] = feature_cols.mean(axis=1)
        test_df[score]['f_std'] = feature_cols.std(axis=1)
        test_df[score]['f_median'] = feature_cols.median(axis=1)
        for i, subject in enumerate(subjects[score]):
            # The subject mask and naive level do not depend on the feature column.
            subject_mask = test_df[score]['subject_id']==subject
            subject_naive = test_df[score].loc[subject_mask, 'naive_mean'].mean()
            for feature_col in feature_cols:
                # a<i> = f<i> re-centred so its per-subject mean equals the naive mean.
                subject_pred = test_df[score].loc[subject_mask, feature_col]
                test_df[score].loc[subject_mask,'a'+feature_col[1:]] = (
                    subject_pred - subject_pred.mean() + subject_naive )

        adjusted_cols = test_df[score].filter(regex='^a')
        test_df[score]['a_mean'] = adjusted_cols.mean(axis=1)
        test_df[score]['a_std'] = adjusted_cols.std(axis=1)
        test_df[score]['a_median'] = adjusted_cols.median(axis=1)

    # Score every method and build the cherry-picked estimate.
    score_df = pd.DataFrame()
    for score in score_names:
        score_df.loc['naive', score] = BEATPD_loss(test_df[score],score,'naive_mean')
        score_df.loc['f_mean', score] = BEATPD_loss(test_df[score],score,'f_mean')
        score_df.loc['a_mean', score] = BEATPD_loss(test_df[score],score,'a_mean')

        parms[K][(score, 'score', 'naive')] = kloss(test_df[score],score,'naive_mean')
        parms[K][(score, 'score', 'ayala')] = kloss(test_df[score],score,'a_mean')

        # Rows whose adjusted ensemble mean lies below (sneg) / above (spos) the
        # naive mean by more than factor * a_std; both factors are 0 here.
        negfactor=0
        posfactor=0
        sneg=test_df[score][(test_df[score]['a_mean']+negfactor*test_df[score]['a_std']<test_df[score]['naive_mean'])  ]
        spos=test_df[score][(test_df[score]['a_mean']-posfactor*test_df[score]['a_std']>test_df[score]['naive_mean'])  ]

        # Report how often the true label falls on the same side of the naive mean.
        logging.info(f'Negative {sneg.shape[0]} Positive {spos.shape[0]} of {test_df[score].shape[0]}')
        logging.info(f"Neg {(sneg[score]<sneg['naive_mean']).sum()} of {sneg.shape[0]} {100*(sneg[score]<sneg['naive_mean']).sum()/sneg.shape[0]:.1f}%")
        logging.info(f"Pos {(spos[score]>spos['naive_mean']).sum()} of {spos.shape[0]} {100*(spos[score]>spos['naive_mean']).sum()/spos.shape[0]:.1f}%")
        # Cherry pick: start from the naive estimate and take a_mean on the selected rows.
        test_df[score]['cherry'] = test_df[score]['naive_mean']
        #test_df[score].loc[sneg.index, 'cherry'] = sneg['a_mean'] + (sneg['a_mean']-sneg['naive_mean']) * 0.5
        #test_df[score].loc[spos.index, 'cherry'] = spos['a_mean'] + (spos['a_mean']-spos['naive_mean']) * 0.5
        test_df[score].loc[sneg.index, 'cherry'] = sneg['a_mean']
        test_df[score].loc[spos.index, 'cherry'] = spos['a_mean']
        score_df.loc['cherry', score] = BEATPD_loss(test_df[score],score,'cherry')
    parms[K]['score'] = score_df

INFO: 2020-05-18 15:16:37,383: K iterator 0
INFO: 2020-05-18 15:16:37,387: Training cis: Read file "data/CIS-PD_Training_Data_IDs_Labels_S34_K0.csv" containing 1486 records
INFO: 2020-05-18 15:16:37,389: Test cis: Read file "data/CIS-PD_Test_Data_IDs_Labels_S34_K0.csv" containing 372 records
INFO: 2020-05-18 15:16:37,391: Training real: Read file "data/REAL-PD_Training_Data_IDs_Labels_S34_K0.csv" containing 472 records
INFO: 2020-05-18 15:16:37,392: Test real: Read file "data/REAL-PD_Test_Data_IDs_Labels_S34_K0.csv" containing 119 records
INFO: 2020-05-18 15:16:37,399: Score: tremor: unique subjects 19:
[1004 1006 1007 1019 1020 1023 1032 1034 1038 1043 1046 1048 1049 'hbv013'
 'hbv038' 'hbv023' 'hbv054' 'hbv022' 'hbv012']
INFO: 2020-05-18 15:16:37,461: Score: dyskinesia: unique subjects 16:
[1004 1007 1019 1023 1034 1038 1039 1043 1044 1048 1049 'hbv013' 'hbv017'
 'hbv054' 'hbv018' 'hbv043']
INFO: 2020-05-18 15:16:37,513: Score: on_off: unique subjects 22:
[1004 1006 1007 1019 1020 10

In [5]:
# Every subject id found in the training labels currently loaded, cis first, then real.
sss = [
    subject
    for dataset in ('cis', 'real')
    for subject in training_labels[dataset]['subject_id'].unique()
]
# da: one column per (score, fold) holding naive MSE minus adjusted-ensemble MSE,
# so a positive value means the ensemble did better for that subject on that fold.
da = pd.DataFrame(index=sss)
good_df, naive_df = pd.DataFrame(), pd.DataFrame()
for score in score_names:
    for K in Ks:
        naive_loss = pd.Series(parms[K][(score,'score','naive')])
        ayala_loss = pd.Series(parms[K][(score,'score','ayala')])
        da[score+str(K)] = naive_loss - ayala_loss
    # Average the per-fold gain of this score; the sign decides the subject's group.
    dd = da.filter(regex=f'{score}')
    mean_gain = dd.mean(axis=1)
    good_df[score] = mean_gain > 0.0
    naive_df[score] = mean_gain < 0.0

In [6]:
# List, per score, the subjects for which the naive estimate won on average.
for score in score_names:
    flagged = naive_df.loc[naive_df[score]]
    print(score)
    print(flagged.index)

tremor
Index([1007, 1023, 1034, 1046, 1048, 'hbv054', 'hbv012'], dtype='object')
dyskinesia
Index([1007, 1023, 1034, 1043, 1044, 1048, 'hbv054', 'hbv018'], dtype='object')
on_off
Index([1006, 1044, 'hbv013', 'hbv051', 'hbv077', 'hbv043'], dtype='object')


In [9]:
da

Unnamed: 0,tremor0,tremor1,tremor2,tremor3,dyskinesia0,dyskinesia1,dyskinesia2,dyskinesia3,on_off0,on_off1,on_off2,on_off3
1004,0.05247,0.407035,0.232054,0.5099222,0.040748,0.19283,0.343903,0.426991,-0.022416,0.264478,0.321505,0.3934834
1006,-0.020648,0.036357,-0.083122,0.08341722,,,,,0.015756,0.005744,-0.045952,-0.007995845
1007,0.012604,-0.002898,-0.02176,-0.001350119,-0.001546,-0.000776,-0.000652,0.000244,0.149825,0.025629,0.088554,-0.01051866
1019,0.233259,-0.045032,0.089754,-0.1213671,0.421769,0.068614,0.210575,-0.153097,0.567448,0.100092,0.268251,0.04578736
1020,0.013509,0.022539,0.031801,-0.002546523,,,,,0.042677,0.014205,0.021498,0.003199716
1023,0.011125,-0.002729,-0.061053,-0.02185325,-0.016623,-0.107225,-0.065756,-0.252136,-0.036632,0.066628,-0.096171,0.2661924
1032,-0.015115,0.037728,0.0092,0.03699515,,,,,-0.024152,0.027688,0.075018,0.08264846
1034,-0.035379,-0.061622,0.029295,0.02172455,-0.047012,-0.012737,-0.079134,-0.042991,0.240171,0.398008,0.075692,-0.1143865
1038,0.039815,-0.038332,-0.024963,0.03651111,0.043232,0.074147,0.009916,0.124211,0.304885,-0.27711,0.242414,0.5729182
1039,,,,,0.083635,0.15142,0.043571,0.098651,0.028309,0.303863,0.058165,0.1714582


### Read label files and re-score each fold with subject filtering (`dropNaive` / `onlyGood`)

In [7]:
# Second pass over all folds: same scoring as the baseline pass, but the
# cherry-picked replacement is restricted by `dropNaive` / `onlyGood` using the
# `naive_df` / `good_df` subject flags. `parms` is rebuilt from scratch.
kscore = {}
parms  = { }
for K in Ks:
    parms [K] = {}

    logging.info(f'K iterator {K}')
    training_labels, test_labels, test_data_folder = {}, {}, {}

    # Read the training and test label files of this fold for each cohort.
    # `labels_folder` comes from the parameters cell; it is no longer
    # re-hard-coded here, so changing it there takes effect in this pass too.
    for dataset in datasets:
        test_label_file = os.path.join(labels_folder, dataset.upper()+f'-PD_Test_Data_IDs_Labels_{test_rand}{K}.csv')
        training_label_file = os.path.join(labels_folder, dataset.upper()+f'-PD_Training_Data_IDs_Labels_{test_rand}{K}.csv')

        training_labels[dataset] = pd.read_csv(training_label_file)
        logging.info(f'Training {dataset}: Read file "{training_label_file}" containing {training_labels[dataset].shape[0]} records')

        test_labels[dataset] = pd.read_csv(test_label_file)
        logging.info(f'Test {dataset}: Read file "{test_label_file}" containing {test_labels[dataset].shape[0]} records')

    # Per score: merge both cohorts, drop unlabelled rows and attach the naive
    # estimate (mean/std of the subject's training labels) to every test row.
    train_df, test_df = {}, {}
    subjects, test_subject_sq = {}, {}
    for score in score_names:
        field_cols = ['measurement_id', 'subject_id', score]
        train_df[score] = pd.concat([training_labels['cis'][field_cols], training_labels['real'][field_cols]]).dropna(subset=[score])
        test_df[score] = pd.concat([test_labels['cis'][field_cols], test_labels['real'][field_cols]]).dropna(subset=[score])
        subjects[score] = train_df[score]['subject_id'].unique()
        logging.info(f'Score: {score}: unique subjects {subjects[score].shape[0]}:\n{subjects[score]}')
        # Iterate over this score's subjects. Iterating over `subjects` itself
        # walks the dict keys (score names), so the counts were taken against
        # score names rather than subject ids.
        test_subject_sq[score] = [np.sqrt((test_df[score]['subject_id']==subject).sum()) for subject in subjects[score]]
        for subject in subjects[score]:
            # Masks and training labels are the same for both assignments; compute once.
            test_mask = test_df[score]['subject_id']==subject
            train_scores = train_df[score][train_df[score]['subject_id']==subject][score]
            test_df[score].loc[test_mask, 'naive_mean'] = train_scores.mean()
            test_df[score].loc[test_mask, 'naive_std'] = train_scores.std()
        test_df[score].set_index('measurement_id', inplace=True)

    # Attach one column f<i> per feature prediction file, aligned on measurement_id.
    for score in score_names:
        feature_results = sorted(glob.glob(f'features/{test_rand}{K}/*{score}_*.csv'))
        logging.info(f'Score {score} - number of feature files {len(feature_results)}')
        for i, file_name in enumerate(feature_results):
            d = pd.read_csv(file_name).set_index('measurement_id')
            test_df[score][f'f{i}'] = d['prediction']
            # Where a file has no prediction, fall back to the naive estimate.
            idx = test_df[score][f'f{i}'].isna() & ~test_df[score]['naive_mean'].isna()
            test_df[score].loc[idx,f'f{i}'] = test_df[score].loc[idx,'naive_mean']

    # Ensemble statistics over the raw (f*) and the subject-adjusted (a*) predictions.
    for j, score in enumerate(score_names):
        feature_cols = test_df[score].filter(regex='^f')
        test_df[score]['f_mean'] = feature_cols.mean(axis=1)
        test_df[score]['f_std'] = feature_cols.std(axis=1)
        test_df[score]['f_median'] = feature_cols.median(axis=1)
        for i, subject in enumerate(subjects[score]):
            # The subject mask and naive level do not depend on the feature column.
            subject_mask = test_df[score]['subject_id']==subject
            subject_naive = test_df[score].loc[subject_mask, 'naive_mean'].mean()
            for feature_col in feature_cols:
                # a<i> = f<i> re-centred so its per-subject mean equals the naive mean.
                subject_pred = test_df[score].loc[subject_mask, feature_col]
                test_df[score].loc[subject_mask,'a'+feature_col[1:]] = (
                    subject_pred - subject_pred.mean() + subject_naive )

        adjusted_cols = test_df[score].filter(regex='^a')
        test_df[score]['a_mean'] = adjusted_cols.mean(axis=1)
        test_df[score]['a_std'] = adjusted_cols.std(axis=1)
        test_df[score]['a_median'] = adjusted_cols.median(axis=1)

    # Score every method and build the cherry-picked estimate.
    score_df = pd.DataFrame()
    for score in score_names:
        score_df.loc['naive', score] = BEATPD_loss(test_df[score],score,'naive_mean')
        score_df.loc['f_mean', score] = BEATPD_loss(test_df[score],score,'f_mean')
        score_df.loc['a_mean', score] = BEATPD_loss(test_df[score],score,'a_mean')

        parms[K][(score, 'score', 'naive')] = kloss(test_df[score],score,'naive_mean')
        parms[K][(score, 'score', 'ayala')] = kloss(test_df[score],score,'a_mean')

        # Rows whose adjusted ensemble mean lies below (sneg) / above (spos) the
        # naive mean by more than factor * a_std; both factors are 0 here.
        negfactor=0
        posfactor=0
        sneg=test_df[score][(test_df[score]['a_mean']+negfactor*test_df[score]['a_std']<test_df[score]['naive_mean'])  ]
        spos=test_df[score][(test_df[score]['a_mean']-posfactor*test_df[score]['a_std']>test_df[score]['naive_mean'])  ]

        # Subject filtering: rows dropped here keep the naive estimate below.
        if dropNaive:
            naive = naive_df[naive_df[score]].index.tolist()
            sneg = sneg[~sneg['subject_id'].isin(naive)]
            spos = spos[~spos['subject_id'].isin(naive)]
        if onlyGood:
            good = good_df[good_df[score]].index.tolist()
            sneg = sneg[sneg['subject_id'].isin(good)]
            spos = spos[spos['subject_id'].isin(good)]

        # Report how often the true label falls on the same side of the naive mean.
        logging.info(f'Negative {sneg.shape[0]} Positive {spos.shape[0]} of {test_df[score].shape[0]}')
        logging.info(f"Neg {(sneg[score]<sneg['naive_mean']).sum()} of {sneg.shape[0]} {100*(sneg[score]<sneg['naive_mean']).sum()/sneg.shape[0]:.1f}%")
        logging.info(f"Pos {(spos[score]>spos['naive_mean']).sum()} of {spos.shape[0]} {100*(spos[score]>spos['naive_mean']).sum()/spos.shape[0]:.1f}%")
        # Cherry pick: start from the naive estimate and take a_mean on the selected rows.
        test_df[score]['cherry'] = test_df[score]['naive_mean']
        #test_df[score].loc[sneg.index, 'cherry'] = sneg['a_mean'] + (sneg['a_mean']-sneg['naive_mean']) * 0.5
        #test_df[score].loc[spos.index, 'cherry'] = spos['a_mean'] + (spos['a_mean']-spos['naive_mean']) * 0.5
        test_df[score].loc[sneg.index, 'cherry'] = sneg['a_mean']
        test_df[score].loc[spos.index, 'cherry'] = spos['a_mean']
        score_df.loc['cherry', score] = BEATPD_loss(test_df[score],score,'cherry')
    parms[K]['score'] = score_df

INFO: 2020-05-18 15:16:51,185: K iterator 0
INFO: 2020-05-18 15:16:51,188: Training cis: Read file "data/CIS-PD_Training_Data_IDs_Labels_S34_K0.csv" containing 1486 records
INFO: 2020-05-18 15:16:51,190: Test cis: Read file "data/CIS-PD_Test_Data_IDs_Labels_S34_K0.csv" containing 372 records
INFO: 2020-05-18 15:16:51,191: Training real: Read file "data/REAL-PD_Training_Data_IDs_Labels_S34_K0.csv" containing 472 records
INFO: 2020-05-18 15:16:51,193: Test real: Read file "data/REAL-PD_Test_Data_IDs_Labels_S34_K0.csv" containing 119 records
INFO: 2020-05-18 15:16:51,200: Score: tremor: unique subjects 19:
[1004 1006 1007 1019 1020 1023 1032 1034 1038 1043 1046 1048 1049 'hbv013'
 'hbv038' 'hbv023' 'hbv054' 'hbv022' 'hbv012']
INFO: 2020-05-18 15:16:51,260: Score: dyskinesia: unique subjects 16:
[1004 1007 1019 1023 1034 1038 1039 1043 1044 1048 1049 'hbv013' 'hbv017'
 'hbv054' 'hbv018' 'hbv043']
INFO: 2020-05-18 15:16:51,312: Score: on_off: unique subjects 22:
[1004 1006 1007 1019 1020 10

In [8]:
for K in Ks:
    print(parms[K]['score'])

          tremor  dyskinesia    on_off
naive   0.428946    0.418017  1.025418
f_mean  0.426847    0.416707  0.993706
a_mean  0.403580    0.392489  0.936373
cherry  0.383735    0.385744  0.931518
          tremor  dyskinesia    on_off
naive   0.507345    0.390019  0.932581
f_mean  0.477495    0.355379  0.986204
a_mean  0.462056    0.368145  0.913249
cherry  0.458789    0.347137  0.905802
          tremor  dyskinesia    on_off
naive   0.368933    0.406923  0.932095
f_mean  0.341806    0.388815  0.969064
a_mean  0.336872    0.382983  0.869295
cherry  0.330117    0.369730  0.859071
          tremor  dyskinesia    on_off
naive   0.419708    0.415160  1.043055
f_mean  0.408859    0.411902  1.122499
a_mean  0.395437    0.399446  0.948137
cherry  0.393385    0.375855  0.947961
