# Cherry pick
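Generate the test prediction files by cherry-picking, per subject and per score, between the ensemble mean of the (subject-mean-adjusted) feature predictions and the naive per-subject training mean.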

In [1]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import os
import time
import getpass

import glob

import numpy as np
logging.info("Numpy version " + np.__version__)
import scipy as sp
import scipy.signal as sig
logging.info("Scipy version " + sp.__version__)
import pandas as pd
logging.info("Pandas version " + pd.__version__)
import matplotlib
import matplotlib.pyplot as plt
logging.info("Matplotlib version " + matplotlib.__version__)

%load_ext autoreload
%autoreload 2

# Seed NumPy's global random generator so that random draws made later in the notebook
# (e.g. the DataFrame.sample preview) are repeatable. The standard seed is 34.
np.random.seed (34)

INFO: 2020-05-13 20:12:16,864: Numpy version 1.18.2
INFO: 2020-05-13 20:12:16,989: Scipy version 1.4.1
INFO: 2020-05-13 20:12:17,079: Pandas version 1.0.2
INFO: 2020-05-13 20:12:17,191: Matplotlib version 3.2.0


### Define parameters

In [2]:
# Dataset identifiers; the upper-cased form is the prefix of each label file name.
datasets = [
    "cis",
    "real",
]

# Scores for which a prediction file is produced.
score_names = [
    "tremor",
    "dyskinesia",
    "on_off",
]

# Folder that holds the training and test label CSV files.
labels_folder = "data"

### Read label files

In [3]:
# Label tables keyed by dataset name. `test_data_folder` is created empty here and is
# not filled in by this cell.
training_labels = {}
test_labels = {}
test_data_folder = {}

for dataset in datasets:
    # Label files are named "<DATASET>-PD_<Training|Test>_Data_IDs_Labels.csv".
    file_prefix = dataset.upper()
    test_label_file = os.path.join(labels_folder, file_prefix + '-PD_Test_Data_IDs_Labels.csv')
    training_label_file = os.path.join(labels_folder, file_prefix + '-PD_Training_Data_IDs_Labels.csv')

    # Training labels first, then test labels, logging the record count of each file.
    training_labels[dataset] = pd.read_csv(training_label_file)
    logging.info(f'Training {dataset}: Read file "{training_label_file}" containing {training_labels[dataset].shape[0]} records')

    test_labels[dataset] = pd.read_csv(test_label_file)
    logging.info(f'Test {dataset}: Read file "{test_label_file}" containing {test_labels[dataset].shape[0]} records')

INFO: 2020-05-13 20:12:21,818: Training cis: Read file "data/CIS-PD_Training_Data_IDs_Labels.csv" containing 1858 records
INFO: 2020-05-13 20:12:21,822: Test cis: Read file "data/CIS-PD_Test_Data_IDs_Labels.csv" containing 618 records
INFO: 2020-05-13 20:12:21,825: Training real: Read file "data/REAL-PD_Training_Data_IDs_Labels.csv" containing 591 records
INFO: 2020-05-13 20:12:21,828: Test real: Read file "data/REAL-PD_Test_Data_IDs_Labels.csv" containing 191 records


In [4]:
# For every score, build the combined (cis + real) training and test tables and attach to each
# test record the "naive" predictor: the mean and standard deviation of that subject's
# training labels for the score.
train_df, test_df = {}, {}
subjects = {}
for score in score_names:
    field_cols = ['measurement_id', 'subject_id', score]
    # Rows with a missing value in any of the three selected columns are dropped.
    train_df[score] = pd.concat([training_labels['cis'][field_cols], training_labels['real'][field_cols]]).dropna()
    test_df[score] = pd.concat([test_labels['cis'][field_cols], test_labels['real'][field_cols]]).dropna()
    subjects[score] = train_df[score]['subject_id'].unique()
    logging.info(f'Score: {score}: unique subjects {subjects[score].shape[0]}:\n{subjects[score]}')

    # Per-subject statistics in a single pass over the training labels, instead of re-filtering
    # both tables for every subject. sort=False because subject ids mix integers and strings,
    # so ordering the group keys is neither needed nor meaningful.
    subject_stats = train_df[score].groupby('subject_id', sort=False)[score].agg(['mean', 'std'])

    # Map the statistics onto the test rows. Test subjects without training labels get NaN,
    # and both columns are always created, even when no subject matches.
    test_df[score]['naive_mean'] = test_df[score]['subject_id'].map(subject_stats['mean'])
    test_df[score]['naive_std'] = test_df[score]['subject_id'].map(subject_stats['std'])

    # Later cells align predictions on measurement_id.
    test_df[score].set_index('measurement_id', inplace=True)

INFO: 2020-05-13 20:12:24,716: Score: tremor: unique subjects 19:
[1004 1006 1007 1019 1020 1023 1032 1034 1038 1043 1046 1048 1049 'hbv013'
 'hbv038' 'hbv023' 'hbv054' 'hbv022' 'hbv012']
INFO: 2020-05-13 20:12:24,779: Score: dyskinesia: unique subjects 16:
[1004 1007 1019 1023 1034 1038 1039 1043 1044 1048 1049 'hbv013' 'hbv017'
 'hbv054' 'hbv018' 'hbv043']
INFO: 2020-05-13 20:12:24,832: Score: on_off: unique subjects 22:
[1004 1006 1007 1019 1020 1023 1032 1034 1038 1039 1043 1044 1048 1049
 1051 'hbv013' 'hbv038' 'hbv051' 'hbv077' 'hbv014' 'hbv043' 'hbv022']


### Read feature scores

In [5]:
# Attach one column per feature prediction file (f0, f1, ...) to the test table of each score.
for score in score_names:
    score_frame = test_df[score]
    feature_results = sorted(glob.glob(f'features/Submit_Final/*{score}_*.csv'))
    logging.info(f'Score {score} - number of feature files {len(feature_results)}')
    for i, file_name in enumerate(feature_results):
        column = f'f{i}'
        predictions = pd.read_csv(file_name).set_index('measurement_id')
        # Assignment aligns on the measurement_id index; records absent from the file become NaN.
        score_frame[column] = predictions['prediction']
        # Missing predictions fall back to the subject's naive mean; where that is NaN too,
        # the value stays NaN.
        score_frame[column] = score_frame[column].fillna(score_frame['naive_mean'])

INFO: 2020-05-13 20:12:27,882: Score tremor - number of feature files 32
INFO: 2020-05-13 20:12:28,000: Score dyskinesia - number of feature files 32
INFO: 2020-05-13 20:12:28,114: Score on_off - number of feature files 32


### Compute ensemble means and subject-adjusted predictions

In [6]:
# For every score:
#   f_mean : row-wise mean of the raw feature predictions f0, f1, ...
#   a<i>   : f<i> shifted per subject so that the subject's mean prediction equals the
#            subject's naive (training) mean:  a = f - mean_subject(f) + naive_mean
#   a_mean : row-wise mean of the adjusted predictions a0, a1, ...
for score in score_names:
    frame = test_df[score]

    # Match only f<number>: a bare '^f' would also pick up the derived 'f_mean' column when
    # this cell is re-run, and would then overwrite 'a_mean' through the 'a' + name[1:] renaming.
    feature_cols = frame.filter(regex=r'^f\d+$')
    frame['f_mean'] = feature_cols.mean(axis=1)

    if not feature_cols.empty:
        feature_names = list(feature_cols.columns)
        # Mean of each feature column within a subject, broadcast back onto that subject's rows.
        # sort=False: subject ids mix integers and strings, no key ordering is required.
        subject_means = frame.groupby('subject_id', sort=False)[feature_names].transform('mean')
        # naive_mean is constant within a subject. Subjects without training labels have a NaN
        # naive_mean, so their adjusted predictions are NaN.
        adjusted = feature_cols.sub(subject_means).add(frame['naive_mean'], axis=0)
        for feature_col in feature_names:
            frame['a' + feature_col[1:]] = adjusted[feature_col]

    # Match only a<number>, so a previously computed 'a_mean' never feeds back into the mean.
    adjusted_cols = frame.filter(regex=r'^a\d+$')
    frame['a_mean'] = adjusted_cols.mean(axis=1)


In [7]:
# List the distinct subject ids found in the test table of each score.
for score in score_names:
    test_subjects = test_df[score]['subject_id'].unique()
    print(test_subjects)

[1004 1006 1007 1019 1020 1023 1032 1034 1038 1039 1043 1044 1046 1048
 1049 1051 'hbv013' 'hbv038' 'hbv017' 'hbv023' 'hbv051' 'hbv077' 'hbv054'
 'hbv014' 'hbv018' 'hbv043' 'hbv022' 'hbv012']
[1004 1006 1007 1019 1020 1023 1032 1034 1038 1039 1043 1044 1046 1048
 1049 1051 'hbv013' 'hbv038' 'hbv017' 'hbv023' 'hbv051' 'hbv077' 'hbv054'
 'hbv014' 'hbv018' 'hbv043' 'hbv022' 'hbv012']
[1004 1006 1007 1019 1020 1023 1032 1034 1038 1039 1043 1044 1046 1048
 1049 1051 'hbv013' 'hbv038' 'hbv017' 'hbv023' 'hbv051' 'hbv077' 'hbv054'
 'hbv014' 'hbv018' 'hbv043' 'hbv022' 'hbv012']


In [8]:
# Subjects, per score, whose 'a_mean' prediction is overwritten with their 'naive_mean'.
naive_subjects = {
    'tremor': [1007, 1023, 1034, 1046, 'hbv038', 'hbv054', 'hbv012'],
    'dyskinesia': [1007, 1023, 1034, 1043, 1044, 1048, 'hbv054', 'hbv018'],
    'on_off': [1006, 1044, 'hbv013', 'hbv038', 'hbv051', 'hbv077', 'hbv043'],
}

for score in score_names:
    score_frame = test_df[score]
    for naive_subject in naive_subjects[score]:
        # Rows belonging to this subject; computed once and used for both read and write.
        subject_rows = score_frame['subject_id'] == naive_subject
        score_frame.loc[subject_rows, 'a_mean'] = score_frame.loc[subject_rows, 'naive_mean']

In [9]:
# Spot check: ten random tremor records showing the naive mean next to a_mean.
preview_cols = ['naive_mean', 'a_mean', 'subject_id']
test_df['tremor'][preview_cols].sample(10)

Unnamed: 0_level_0,naive_mean,a_mean,subject_id
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2e2c36d8-62ce-4c14-bfa7-97793d7bf418,,,hbv017
0b8a1616-5c4b-451f-9341-1637bef7587b,2.130435,1.921431,hbv023
dad4ab3e-89cb-4ecd-80c4-c0da16688857,,,hbv051
7a806e8e-6191-4b77-87e8-362bb37872e7,0.327759,0.327759,1007
e19233ad-5568-43b2-8a3f-d989dd2bea52,0.418079,0.679263,1032
6877b35f-474a-41b6-ab7b-d9fdcd3155ca,1.0,0.460486,1004
bcddfca4-138b-420a-a45b-8ae5871214ac,0.349057,0.349057,1023
a66069e9-d744-4ffb-8f4b-5df7b3ca5822,0.373134,0.17638,hbv013
ddb8178e-9b5d-4146-9941-bb24e1161c2c,1.347826,1.187425,1038
e854ed3a-59f3-44c0-85db-1f4517824f70,,,1051


In [10]:
# For every score: read the submission template, fill its 'prediction' column with 'a_mean'
# (negative values clipped to 0) and write the result next to the template as
# '<template name>_Predict.csv'. The unmodified template is kept in submit[score].
submit = {}

for score in score_names:
    # sorted() makes the choice deterministic should more than one template match.
    template_files = sorted(glob.glob(f'features/BEAT-PD_SC*_{score}_Submit_Final.csv'))
    if not template_files:
        # Fail with a clear message instead of an IndexError on template_files[0].
        raise FileNotFoundError(f'No submission template found for score "{score}" in folder "features"')
    if len(template_files) > 1:
        logging.warning(f'Score {score}: {len(template_files)} submission templates found, using "{template_files[0]}"')
    template_file = template_files[0]

    submit[score] = pd.read_csv(template_file)
    logging.info(f'Submission file {submit[score].shape[0]} records')
    submission = submit[score].set_index('measurement_id')

    # Look up the prediction for every measurement in the template; negative values are set to 0.
    submission['prediction'] = test_df[score].loc[submission.index, 'a_mean'].clip(lower=0)

    # A NaN prediction would be written as an empty field; report it rather than hide it.
    missing_predictions = int(submission['prediction'].isna().sum())
    if missing_predictions:
        logging.warning(f'Score {score}: {missing_predictions} records have no prediction (NaN)')

    # Insert '_Predict' before the extension only; str.replace would also rewrite any other
    # '.csv' occurrence in the path.
    output_root, output_ext = os.path.splitext(template_file)
    submission.to_csv(f'{output_root}_Predict{output_ext}')

INFO: 2020-05-13 20:12:54,858: Submission file 588 records
INFO: 2020-05-13 20:12:54,864: Submission file 482 records
INFO: 2020-05-13 20:12:54,869: Submission file 695 records
