# Cherry pick
Generate the test prediction files by cherry-picking from ensembles of feature predictions.

In [None]:
# Configure the root logger first so that the version messages logged below
# already use this level and format.
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import os
import time
import getpass

import glob

# Third-party libraries; the version of each one is logged right after import.
import numpy as np
logging.info("Numpy version " + np.__version__)
import scipy as sp
import scipy.signal as sig
logging.info("Scipy version " + sp.__version__)
import pandas as pd
logging.info("Pandas version " + pd.__version__)
import matplotlib
import matplotlib.pyplot as plt
logging.info("Matplotlib version " + matplotlib.__version__)

# IPython magics (only valid inside IPython/Jupyter): enable the autoreload
# extension in mode 2.
%load_ext autoreload
%autoreload 2

# For the standard seed use 34
# Seeds NumPy's global random generator.
# NOTE(review): presumably for reproducibility of later random operations
# (e.g. the DataFrame.sample preview) - confirm.
np.random.seed (34)

### Define parameters

In [None]:
# Dataset identifiers processed by this notebook.
datasets = [
    'cis',
    'real',
]

# Names of the score columns handled by this notebook.
score_names = [
    'tremor',
    'dyskinesia',
    'on_off',
]

# Folder containing the label CSV files.
labels_folder = 'data'

### Read label files

In [None]:
# Per-dataset containers. test_data_folder is created here but not filled
# in this cell.
training_labels, test_labels, test_data_folder = {}, {}, {}

for dataset in datasets:
    # Label file names start with the upper-cased dataset name.
    file_prefix = dataset.upper()
    training_label_file = os.path.join(labels_folder, file_prefix+'-PD_Training_Data_IDs_Labels.csv')
    test_label_file = os.path.join(labels_folder, file_prefix+'-PD_Test_Data_IDs_Labels.csv')

    # Training labels are read first, then the test labels.
    training_labels[dataset] = pd.read_csv(training_label_file)
    logging.info(f'Training {dataset}: Read file "{training_label_file}" containing {training_labels[dataset].shape[0]} records')

    test_labels[dataset] = pd.read_csv(test_label_file)
    logging.info(f'Test {dataset}: Read file "{test_label_file}" containing {test_labels[dataset].shape[0]} records')

In [None]:
# For every score build one training and one test frame (both datasets
# stacked, rows with missing values dropped) and attach to each test row the
# mean / standard deviation of that subject's training scores.
train_df, test_df = {}, {}
subjects = {}
for score in score_names:
    field_cols = ['measurement_id', 'subject_id', score]
    train_parts = [training_labels[name][field_cols] for name in ('cis', 'real')]
    test_parts = [test_labels[name][field_cols] for name in ('cis', 'real')]
    train_df[score] = pd.concat(train_parts).dropna()
    test_df[score] = pd.concat(test_parts).dropna()

    train_scores = train_df[score]
    test_scores = test_df[score]

    subjects[score] = train_scores['subject_id'].unique()
    logging.info(f'Score: {score}: unique subjects {subjects[score].shape[0]}:\n{subjects[score]}')

    for subject in subjects[score]:
        in_train = train_scores['subject_id'] == subject
        in_test = test_scores['subject_id'] == subject
        # Training scores of this subject only.
        subject_scores = train_scores.loc[in_train, score]
        test_scores.loc[in_test, 'naive_mean'] = subject_scores.mean()
        test_scores.loc[in_test, 'naive_std'] = subject_scores.std()

    # Index the test frame by measurement id (in place, so the dict entry
    # keeps pointing at the same object).
    test_scores.set_index('measurement_id', inplace=True)

### Read feature scores

In [None]:
# Load every feature prediction file of each score as a column f0, f1, ...
# of the corresponding test frame (aligned on measurement_id).
for score in score_names:
    feature_results = sorted(glob.glob(f'features/Submit_Final/*{score}_*.csv'))
    logging.info(f'Score {score} - number of feature files {len(feature_results)}')
    score_frame = test_df[score]
    for i, file_name in enumerate(feature_results):
        column = f'f{i}'
        predictions = pd.read_csv(file_name).set_index('measurement_id')
        score_frame[column] = predictions['prediction']
        # Where the file has no prediction but a naive mean is available,
        # fall back to the naive mean.
        needs_fallback = score_frame[column].isna() & score_frame['naive_mean'].notna()
        score_frame.loc[needs_fallback, column] = score_frame.loc[needs_fallback, 'naive_mean']

### Generate a single dataframe

In [None]:
# For every score:
#   * f_mean - row-wise mean of the raw feature columns f0, f1, ...
#   * a<k>   - feature column f<k> shifted per subject so that the subject's
#              mean equals the mean of that subject's naive_mean values
#   * a_mean - row-wise mean of the adjusted columns a0, a1, ...
for score in score_names:
    score_frame = test_df[score]

    # Match only the numbered columns. A bare '^f' / '^a' would also pick up
    # the derived f_mean / a_mean columns when this cell is executed again,
    # feeding the aggregates back into the averages.
    feature_cols = score_frame.filter(regex=r'^f\d+$')
    score_frame['f_mean'] = feature_cols.mean(axis=1)

    for subject in subjects[score]:
        subject_rows = score_frame['subject_id'] == subject
        # Does not depend on the feature column, so compute it once per subject.
        naive_level = score_frame.loc[subject_rows, 'naive_mean'].mean()
        for feature_col in feature_cols:
            subject_values = score_frame.loc[subject_rows, feature_col]
            score_frame.loc[subject_rows, 'a'+feature_col[1:]] = (
                subject_values - subject_values.mean() + naive_level)

    adjusted_cols = score_frame.filter(regex=r'^a\d+$')
    score_frame['a_mean'] = adjusted_cols.mean(axis=1)

In [None]:
# Show the distinct subject ids present in each score's test frame.
for score in score_names:
    subject_ids = test_df[score]['subject_id'].unique()
    print(subject_ids)

In [None]:
# Subjects, per score, whose ensemble prediction (a_mean) is replaced by
# their naive_mean value.
naive_subjects = {
    'tremor': [1007, 1023, 1034, 1046, 'hbv038', 'hbv054', 'hbv012'],
    'dyskinesia': [1007, 1023, 1034, 1043, 1044, 1048, 'hbv054', 'hbv018'],
    'on_off': [1006, 1044, 'hbv013', 'hbv038', 'hbv051', 'hbv077', 'hbv043'],
}

for score in score_names:
    score_frame = test_df[score]
    for naive_subject in naive_subjects[score]:
        subject_rows = score_frame['subject_id'] == naive_subject
        score_frame.loc[subject_rows, 'a_mean'] = score_frame.loc[subject_rows, 'naive_mean']

In [None]:
# Display ten randomly chosen tremor rows with the naive and final predictions.
preview_columns = ['naive_mean', 'a_mean', 'subject_id']
test_df['tremor'][preview_columns].sample(10)

In [None]:
# For every score: load the submission template, fill its 'prediction'
# column with the a_mean value of the matching measurement_id, clamp negative
# predictions to 0 and write the result next to the template with a
# '_Predict' suffix.
submit = {}

for score in score_names:
    template_pattern = f'features/BEAT-PD_SC*_{score}_Submit_Final.csv'
    # sorted() makes the choice deterministic if several files match.
    template_files = sorted(glob.glob(template_pattern))
    if not template_files:
        # Fail with a message that names the missing file instead of an
        # IndexError on an empty list.
        raise FileNotFoundError(f'No submission template matches "{template_pattern}"')
    template_file = template_files[0]

    submit[score] = pd.read_csv(template_file)
    logging.info(f'Submission file {submit[score].shape[0]} records')

    predictions = submit[score].set_index('measurement_id')
    predictions['prediction'] = test_df[score].loc[predictions.index, 'a_mean']

    # Missing predictions are still written out, but no longer silently.
    missing_count = int(predictions['prediction'].isna().sum())
    if missing_count:
        logging.warning(f'Score {score}: {missing_count} measurements have no prediction')

    # Predictions below zero are set to zero.
    predictions['prediction'] = predictions['prediction'].clip(lower=0)

    # Only touch the file extension: str.replace would rewrite every '.csv'
    # occurrence in the path and would leave a differently-cased extension
    # unchanged, overwriting the template.
    root, extension = os.path.splitext(template_file)
    predictions.to_csv(root + '_Predict' + extension)