# Cherry pick
Generate the test prediction files by cherry-picking from ensembles of feature predictions.

In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import os
import time
import getpass

import glob

import numpy as np
logging.info("Numpy version " + np.__version__)
import scipy as sp
import scipy.signal as sig
logging.info("Scipy version " + sp.__version__)
import pandas as pd
logging.info("Pandas version " + pd.__version__)
import matplotlib
import matplotlib.pyplot as plt
logging.info("Matplotlib version " + matplotlib.__version__)

%load_ext autoreload
%autoreload 2

%matplotlib inline

# For the standard seed use 34
np.random.seed (34)

### Define parameters

In [None]:
# Dataset names; upper-cased when building the label file names.
datasets = ['cis', 'real']
# Label columns that are scored, one at a time.
score_names = ['tremor', 'dyskinesia', 'on_off']

# Folder that holds the training/test label CSV files.
labels_folder = 'data'
# Fold indices; each one is appended to `test_rand` in label file names
# and in the per-fold feature folder name.
Ks = list(range(5))
test_rand = 'S34_K'

# Switches for the cherry-picking pass: drop subjects flagged as naive,
# and/or keep only subjects flagged as good.
dropNaive, onlyGood = True, False

In [None]:
def BEATPD_loss(df, real_score, est_score, subject_ids=None):
    """Return the per-subject MSE averaged with sqrt(row count) weights.

    For every subject the mean squared error between ``df[real_score]`` and
    ``df[est_score]`` is computed over that subject's rows. The result is the
    weighted average of those errors, each weighted by the square root of the
    subject's number of rows in ``df``.

    Args:
        df: DataFrame with a ``subject_id`` column and the two score columns.
        real_score: Name of the column holding the true values.
        est_score: Name of the column holding the estimates.
        subject_ids: Iterable of subject ids to score. When omitted, the
            notebook-level globals ``subjects[score]`` are used, which is
            what the original implementation always did.

    Returns:
        The weighted loss, or NaN when no subject has any non-missing true
        value (the original raised ZeroDivisionError in that case).
    """
    if subject_ids is None:
        # Backward-compatible default: rely on the globals set by the fold loop.
        subject_ids = subjects[score]

    weighted_mse_sum = 0.0
    weight_sum = 0.0
    for subject in subject_ids:
        idx = (df['subject_id'] == subject)
        truth = df.loc[idx, real_score]
        # Skip subjects without any true value (also covers subjects that are
        # absent from df). A plain truth test is used instead of `~`, which
        # would always be truthy if `.all()` ever returned a Python bool.
        if truth.isna().all():
            continue
        subject_mse = ((truth - df.loc[idx, est_score])**2).mean()
        subject_weight = np.sqrt(idx.sum())
        weighted_mse_sum += subject_mse * subject_weight
        weight_sum += subject_weight

    if weight_sum == 0:
        # Nothing to score.
        return np.nan
    return weighted_mse_sum / weight_sum

def kloss(df, real_score, est_score, subject_ids=None):
    """Return a dict mapping each subject id to its mean squared error.

    Args:
        df: DataFrame with a ``subject_id`` column and the two score columns.
        real_score: Name of the column holding the true values.
        est_score: Name of the column holding the estimates.
        subject_ids: Iterable of subject ids to score. When omitted, the
            notebook-level globals ``subjects[score]`` are used, which is
            what the original implementation always did.

    Returns:
        ``{subject_id: mse}`` for every subject that has at least one
        non-missing true value in ``df``; other subjects are left out.
    """
    if subject_ids is None:
        # Backward-compatible default: rely on the globals set by the fold loop.
        subject_ids = subjects[score]

    loss = {}
    for subject in subject_ids:
        idx = (df['subject_id'] == subject)
        truth = df.loc[idx, real_score]
        # Plain truth test instead of `~` on the scalar: `~` would always be
        # truthy if `.all()` ever returned a Python bool.
        if truth.isna().all():
            continue
        loss[subject] = ((truth - df.loc[idx, est_score])**2).mean()
    return loss
    

### Define good and naive subjects

In [None]:
# First pass over the folds. For every fold K this reads the label files and
# the per-feature prediction files, builds the naive / ensemble / adjusted
# predictions per score, and stores in parms[K]:
#   (score, 'score', 'naive') -> per-subject MSE of the naive prediction
#   (score, 'score', 'ayala') -> per-subject MSE of the adjusted ensemble mean
#   'score'                   -> summary table of the weighted losses
parms  = { }
for K in Ks:
    parms [K] = {}

    logging.info(f'K iterator {K}')
    training_labels, test_labels, test_data_folder = {}, {}, {}

    # Read the training and test label tables of this fold for each dataset.
    for dataset in datasets:
        test_label_file = os.path.join(labels_folder, dataset.upper()+f'-PD_Test_Data_IDs_Labels_{test_rand}{K}.csv')
        training_label_file = os.path.join(labels_folder, dataset.upper()+f'-PD_Training_Data_IDs_Labels_{test_rand}{K}.csv')

        training_labels[dataset] = pd.read_csv(training_label_file)
        logging.info(f'Training {dataset}: Read file "{training_label_file}" containing {training_labels[dataset].shape[0]} records')

        test_labels[dataset] = pd.read_csv(test_label_file)
        logging.info(f'Test {dataset}: Read file "{test_label_file}" containing {test_labels[dataset].shape[0]} records')

    train_df, test_df = {}, {}
    subjects, test_subject_sq = {}, {}
    for score in score_names:
        # Keep only the rows that carry a label for this score.
        field_cols = ['measurement_id', 'subject_id', score]
        train_df[score] = pd.concat([training_labels['cis'][field_cols], training_labels['real'][field_cols]]).dropna(subset=[score])
        test_df[score] = pd.concat([test_labels['cis'][field_cols], test_labels['real'][field_cols]]).dropna(subset=[score])
        subjects[score] = train_df[score]['subject_id'].unique()
        logging.info(f'Score: {score}: unique subjects {subjects[score].shape[0]}:\n{subjects[score]}')
        # Square root of each subject's number of test rows. The original
        # iterated over `subjects` (the dict, i.e. the score names) instead of
        # the subject ids of this score, so every count came out as 0.
        test_subject_sq[score] = [np.sqrt((test_df[score]['subject_id']==subject).sum()) for subject in subjects[score]]
        # Naive prediction: the subject's mean (and std) of the training labels.
        for subject in subjects[score]:
            test_df[score].loc[test_df[score]['subject_id']==subject, 'naive_mean'] = train_df[score][train_df[score]['subject_id']==subject][score].mean()
            test_df[score].loc[test_df[score]['subject_id']==subject, 'naive_std'] = train_df[score][train_df[score]['subject_id']==subject][score].std()
        test_df[score].set_index('measurement_id', inplace=True)

    for score in score_names:
        # One column f{i} per prediction file, aligned on measurement_id.
        feature_results = sorted(glob.glob(f'features/{test_rand}{K}/*{score}_*.csv'))
        logging.info(f'Score {score} - number of feature files {len(feature_results)}')
        for i, file_name in enumerate(feature_results):
            d = pd.read_csv(file_name).set_index('measurement_id')
            test_df[score][f'f{i}'] = d['prediction']
            # Missing predictions fall back to the naive mean where one exists.
            idx = test_df[score][f'f{i}'].isna() & ~test_df[score]['naive_mean'].isna()
            test_df[score].loc[idx,f'f{i}'] = test_df[score].loc[idx,'naive_mean']

    ### Generate a single dataframe

    for j, score in enumerate(score_names):
        # Snapshot of the f{i} columns, taken before the f_* summary columns are added.
        feature_cols = test_df[score].filter(regex='^f')
        test_df[score]['f_mean'] = feature_cols.mean(axis=1)
        test_df[score]['f_std'] = feature_cols.std(axis=1)
        test_df[score]['f_median'] = feature_cols.median(axis=1)
        # Adjusted column a{i}: the feature prediction shifted per subject so
        # that its mean over the subject's test rows equals the naive mean.
        for i, subject in enumerate(subjects[score]):
            for feature_col in feature_cols:
                test_df[score].loc[test_df[score]['subject_id']==subject,'a'+feature_col[1:]] = (
                    test_df[score].loc[test_df[score]['subject_id']==subject, feature_col] -
                    test_df[score].loc[test_df[score]['subject_id']==subject, feature_col].mean() +
                    test_df[score].loc[test_df[score]['subject_id']==subject, 'naive_mean'].mean() )

        adjusted_cols = test_df[score].filter(regex='^a')
        test_df[score]['a_mean'] = adjusted_cols.mean(axis=1)
        test_df[score]['a_std'] = adjusted_cols.std(axis=1)
        test_df[score]['a_median'] = adjusted_cols.median(axis=1)

    score_df = pd.DataFrame()
    for score in score_names:
        score_df.loc['naive', score] = BEATPD_loss(test_df[score],score,'naive_mean')
        score_df.loc['f_mean', score] = BEATPD_loss(test_df[score],score,'f_mean')
        score_df.loc['a_mean', score] = BEATPD_loss(test_df[score],score,'a_mean')

        parms[K][(score, 'score', 'naive')] = kloss(test_df[score],score,'naive_mean')
        parms[K][(score, 'score', 'ayala')] = kloss(test_df[score],score,'a_mean')

        # Rows whose adjusted mean lies below (sneg) or above (spos) the naive
        # mean by more than factor * a_std; both factors are 0 here.
        negfactor=0
        posfactor=0
        sneg=test_df[score][(test_df[score]['a_mean']+negfactor*test_df[score]['a_std']<test_df[score]['naive_mean'])  ]
        spos=test_df[score][(test_df[score]['a_mean']-posfactor*test_df[score]['a_std']>test_df[score]['naive_mean'])  ]

        logging.info(f'Negative {sneg.shape[0]} Positive {spos.shape[0]} of {test_df[score].shape[0]}')
        # How often the true label lies on the same side of the naive mean as
        # the prediction. The percentage is NaN for an empty selection instead
        # of dividing by zero.
        neg_hits = (sneg[score]<sneg['naive_mean']).sum()
        neg_pct = 100*neg_hits/sneg.shape[0] if sneg.shape[0] else float('nan')
        logging.info(f"Neg {neg_hits} of {sneg.shape[0]} {neg_pct:.1f}%")
        pos_hits = (spos[score]>spos['naive_mean']).sum()
        pos_pct = 100*pos_hits/spos.shape[0] if spos.shape[0] else float('nan')
        logging.info(f"Pos {pos_hits} of {spos.shape[0]} {pos_pct:.1f}%")
        # Cherry-picked prediction: naive mean by default, adjusted mean on the selected rows.
        test_df[score]['cherry'] = test_df[score]['naive_mean']
        #test_df[score].loc[sneg.index, 'cherry'] = sneg['a_mean'] + (sneg['a_mean']-sneg['naive_mean']) * 0.5
        #test_df[score].loc[spos.index, 'cherry'] = spos['a_mean'] + (spos['a_mean']-spos['naive_mean']) * 0.5
        test_df[score].loc[sneg.index, 'cherry'] = sneg['a_mean']
        test_df[score].loc[spos.index, 'cherry'] = spos['a_mean']
        score_df.loc['cherry', score] = BEATPD_loss(test_df[score],score,'cherry')
    parms[K]['score'] = score_df

In [None]:
# Flag subjects as good or naive from the per-subject losses of all folds.
# A subject is good for a score when the naive loss exceeds the adjusted
# ensemble ('ayala') loss on average over the folds, and naive when it is lower.
cis_subject_ids = training_labels['cis']['subject_id'].unique()
real_subject_ids = training_labels['real']['subject_id'].unique()
sss = [*cis_subject_ids, *real_subject_ids]
da = pd.DataFrame(index=sss)
good_df, naive_df = pd.DataFrame(), pd.DataFrame()
for score in score_names:
    for K in Ks:
        fold_losses = parms[K]
        naive_loss = pd.Series(fold_losses[(score,'score','naive')])
        adjusted_loss = pd.Series(fold_losses[(score,'score','ayala')])
        # Positive values mean the adjusted ensemble beat the naive prediction in this fold.
        da[score+str(K)] = naive_loss - adjusted_loss
    dd = da.filter(regex=f'{score}')
    mean_gain = dd.mean(axis=1)
    good_df[score] = mean_gain>0.0
    naive_df[score] = mean_gain<0.0

In [None]:
# Show, for each score, the subject ids flagged as naive.
for score in score_names:
    naive_mask = naive_df[score]
    print(score)
    print(naive_df[naive_mask].index)

In [None]:
da

### Read label files

In [None]:
# Second pass over the folds. Same pipeline as the first pass, but the rows
# that get the adjusted prediction are additionally filtered with the
# naive_df / good_df subject flags, depending on dropNaive / onlyGood.
# parms is rebuilt from scratch with the same keys per fold:
#   (score, 'score', 'naive') -> per-subject MSE of the naive prediction
#   (score, 'score', 'ayala') -> per-subject MSE of the adjusted ensemble mean
#   'score'                   -> summary table of the weighted losses
kscore = {}
parms  = { }
for K in Ks:
    parms [K] = {}

    logging.info(f'K iterator {K}')
    training_labels, test_labels, test_data_folder = {}, {}, {}

    # Read the training and test label tables of this fold for each dataset.
    for dataset in datasets:
        labels_folder = 'data'
        test_label_file = os.path.join(labels_folder, dataset.upper()+f'-PD_Test_Data_IDs_Labels_{test_rand}{K}.csv')
        training_label_file = os.path.join(labels_folder, dataset.upper()+f'-PD_Training_Data_IDs_Labels_{test_rand}{K}.csv')

        training_labels[dataset] = pd.read_csv(training_label_file)
        logging.info(f'Training {dataset}: Read file "{training_label_file}" containing {training_labels[dataset].shape[0]} records')

        test_labels[dataset] = pd.read_csv(test_label_file)
        logging.info(f'Test {dataset}: Read file "{test_label_file}" containing {test_labels[dataset].shape[0]} records')

    train_df, test_df = {}, {}
    subjects, test_subject_sq = {}, {}
    for score in score_names:
        # Keep only the rows that carry a label for this score.
        field_cols = ['measurement_id', 'subject_id', score]
        train_df[score] = pd.concat([training_labels['cis'][field_cols], training_labels['real'][field_cols]]).dropna(subset=[score])
        test_df[score] = pd.concat([test_labels['cis'][field_cols], test_labels['real'][field_cols]]).dropna(subset=[score])
        subjects[score] = train_df[score]['subject_id'].unique()
        logging.info(f'Score: {score}: unique subjects {subjects[score].shape[0]}:\n{subjects[score]}')
        # Square root of each subject's number of test rows. The original
        # iterated over `subjects` (the dict, i.e. the score names) instead of
        # the subject ids of this score, so every count came out as 0.
        test_subject_sq[score] = [np.sqrt((test_df[score]['subject_id']==subject).sum()) for subject in subjects[score]]
        # Naive prediction: the subject's mean (and std) of the training labels.
        for subject in subjects[score]:
            test_df[score].loc[test_df[score]['subject_id']==subject, 'naive_mean'] = train_df[score][train_df[score]['subject_id']==subject][score].mean()
            test_df[score].loc[test_df[score]['subject_id']==subject, 'naive_std'] = train_df[score][train_df[score]['subject_id']==subject][score].std()
        test_df[score].set_index('measurement_id', inplace=True)

    for score in score_names:
        # One column f{i} per prediction file, aligned on measurement_id.
        feature_results = sorted(glob.glob(f'features/{test_rand}{K}/*{score}_*.csv'))
        logging.info(f'Score {score} - number of feature files {len(feature_results)}')
        for i, file_name in enumerate(feature_results):
            d = pd.read_csv(file_name).set_index('measurement_id')
            test_df[score][f'f{i}'] = d['prediction']
            # Missing predictions fall back to the naive mean where one exists.
            idx = test_df[score][f'f{i}'].isna() & ~test_df[score]['naive_mean'].isna()
            test_df[score].loc[idx,f'f{i}'] = test_df[score].loc[idx,'naive_mean']

    ### Generate a single dataframe

    for j, score in enumerate(score_names):
        # Snapshot of the f{i} columns, taken before the f_* summary columns are added.
        feature_cols = test_df[score].filter(regex='^f')
        test_df[score]['f_mean'] = feature_cols.mean(axis=1)
        test_df[score]['f_std'] = feature_cols.std(axis=1)
        test_df[score]['f_median'] = feature_cols.median(axis=1)
        # Adjusted column a{i}: the feature prediction shifted per subject so
        # that its mean over the subject's test rows equals the naive mean.
        for i, subject in enumerate(subjects[score]):
            for feature_col in feature_cols:
                test_df[score].loc[test_df[score]['subject_id']==subject,'a'+feature_col[1:]] = (
                    test_df[score].loc[test_df[score]['subject_id']==subject, feature_col] -
                    test_df[score].loc[test_df[score]['subject_id']==subject, feature_col].mean() +
                    test_df[score].loc[test_df[score]['subject_id']==subject, 'naive_mean'].mean() )

        adjusted_cols = test_df[score].filter(regex='^a')
        test_df[score]['a_mean'] = adjusted_cols.mean(axis=1)
        test_df[score]['a_std'] = adjusted_cols.std(axis=1)
        test_df[score]['a_median'] = adjusted_cols.median(axis=1)

    score_df = pd.DataFrame()
    for score in score_names:
        score_df.loc['naive', score] = BEATPD_loss(test_df[score],score,'naive_mean')
        score_df.loc['f_mean', score] = BEATPD_loss(test_df[score],score,'f_mean')
        score_df.loc['a_mean', score] = BEATPD_loss(test_df[score],score,'a_mean')

        parms[K][(score, 'score', 'naive')] = kloss(test_df[score],score,'naive_mean')
        parms[K][(score, 'score', 'ayala')] = kloss(test_df[score],score,'a_mean')

        # Rows whose adjusted mean lies below (sneg) or above (spos) the naive
        # mean by more than factor * a_std; both factors are 0 here.
        negfactor=0
        posfactor=0
        sneg=test_df[score][(test_df[score]['a_mean']+negfactor*test_df[score]['a_std']<test_df[score]['naive_mean'])  ]
        spos=test_df[score][(test_df[score]['a_mean']-posfactor*test_df[score]['a_std']>test_df[score]['naive_mean'])  ]

        # Restrict the selection by the subject flags held in naive_df / good_df.
        if dropNaive:
            naive = naive_df[naive_df[score]].index.tolist()
            sneg = sneg[~sneg['subject_id'].isin(naive)]
            spos = spos[~spos['subject_id'].isin(naive)]
        if onlyGood:
            good = good_df[good_df[score]].index.tolist()
            sneg = sneg[sneg['subject_id'].isin(good)]
            spos = spos[spos['subject_id'].isin(good)]

        logging.info(f'Negative {sneg.shape[0]} Positive {spos.shape[0]} of {test_df[score].shape[0]}')
        # How often the true label lies on the same side of the naive mean as
        # the prediction. The subject filters above can leave a selection
        # empty, so the percentage is NaN then instead of dividing by zero.
        neg_hits = (sneg[score]<sneg['naive_mean']).sum()
        neg_pct = 100*neg_hits/sneg.shape[0] if sneg.shape[0] else float('nan')
        logging.info(f"Neg {neg_hits} of {sneg.shape[0]} {neg_pct:.1f}%")
        pos_hits = (spos[score]>spos['naive_mean']).sum()
        pos_pct = 100*pos_hits/spos.shape[0] if spos.shape[0] else float('nan')
        logging.info(f"Pos {pos_hits} of {spos.shape[0]} {pos_pct:.1f}%")
        # Cherry-picked prediction: naive mean by default, adjusted mean on the selected rows.
        test_df[score]['cherry'] = test_df[score]['naive_mean']
        #test_df[score].loc[sneg.index, 'cherry'] = sneg['a_mean'] + (sneg['a_mean']-sneg['naive_mean']) * 0.5
        #test_df[score].loc[spos.index, 'cherry'] = spos['a_mean'] + (spos['a_mean']-spos['naive_mean']) * 0.5
        test_df[score].loc[sneg.index, 'cherry'] = sneg['a_mean']
        test_df[score].loc[spos.index, 'cherry'] = spos['a_mean']
        score_df.loc['cherry', score] = BEATPD_loss(test_df[score],score,'cherry')
    parms[K]['score'] = score_df

In [None]:
# Print the loss summary table of every fold.
for K in Ks:
    fold_summary = parms[K]['score']
    print(fold_summary)