# Session score
Generate a single score for each session by aggregating the scores predicted for its individual segments.

In [4]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import warnings
warnings.filterwarnings('ignore')

import os
import sys
import numpy as np
import pandas as pd
from multiprocessing import Pool

from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel, f_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import imblearn
import getpass

# Add the parent directory to the import search path (once) so that the
# project-local helper module imported below can be found.
# NOTE(review): presumably Classification_utils lives one directory up — confirm.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Helper functions used below as helpFunc.cutSession / helpFunc.splitData.
import Classification_utils as helpFunc

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set scoring parameters

In [5]:
def runClassification(train, test, labelName, useFirstHalf, selectFeatures, fixClassImbalance = 'none', seed=np.nan, regressionModel = False):
    """Fit a random forest on segment features and return one prediction per test session.

    The function reads the notebook-level globals ``testSize`` and ``y_fields``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Segment-level tables containing the feature columns, a 'sessionID'
        column and one column for every entry of ``y_fields``.
    labelName : str
        Name of the label column to learn and predict.
    useFirstHalf : bool
        If true, both tables are first passed through ``helpFunc.cutSession``.
    selectFeatures : object
        Feature selector exposing ``fit(X, y)`` and ``transform(X)``.
    fixClassImbalance : str
        'none', 'SMOTEENN' or 'RandomUnderSampler'. Any other value leaves the
        training set untouched.
    seed : int or NaN
        Passed as-is to ``helpFunc.splitData``. For the random-forest
        estimator a NaN seed is mapped to ``None`` (unseeded).
    regressionModel : bool
        Use RandomForestRegressor instead of RandomForestClassifier.

    Returns
    -------
    sessionIDs : numpy.ndarray
        Test session IDs in order of first appearance.
    y_pred : numpy.ndarray
        Mean of the segment predictions of each session, aligned with
        ``sessionIDs``.
    """
    # use only first half of session
    if useFirstHalf:
        train = helpFunc.cutSession(train)
        test = helpFunc.cutSession(test)

    # Only the first part of the split is kept for training.
    train, _ = helpFunc.splitData(train, labelName=labelName, testSize=testSize, seed=seed)

    # remove rows without a label
    train = train.dropna(subset=[labelName])
    test = test.dropna(subset=[labelName])

    # reassign indexes of train and test (which changed after manipulations)
    # to match a simple range
    train.index = range(len(train))
    test.index = range(len(test))

    X_train = train.drop(['sessionID'] + y_fields, axis=1)
    y_train = train[labelName]
    X_test = test.drop(['sessionID'] + y_fields, axis=1)

    # Normalization to [-1, 1].
    # NOTE(review): the scaler is fitted on train and test together, so the
    # test-set value ranges influence the scaling — confirm this is intended.
    min_max_scaler = preprocessing.MinMaxScaler(feature_range = (-1,1))
    min_max_scaler.fit(pd.concat([X_train, X_test], ignore_index=True))
    X_train = min_max_scaler.transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # feature selection: fit on the training data, apply to both sets
    selFeat = selectFeatures.fit(X_train, y_train)
    X_train = selFeat.transform(X_train)
    X_test = selFeat.transform(X_test)

    # handle class imbalance (training data only)
    if fixClassImbalance == 'SMOTEENN':
        smote_enn = imblearn.combine.SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    elif fixClassImbalance == 'RandomUnderSampler':
        rus = imblearn.under_sampling.RandomUnderSampler(random_state=42)
        X_train, y_train = rus.fit_resample(X_train, y_train)

    # The default seed is NaN, which is not a valid random_state for the
    # scikit-learn estimators; treat it as "no seed" there.
    seedIsNan = isinstance(seed, float) and np.isnan(seed)
    modelSeed = None if seedIsNan else seed

    # run models
    if regressionModel:
        clf = RandomForestRegressor(random_state = modelSeed)
    else:
        clf = RandomForestClassifier(random_state = modelSeed)
    clf.fit(X_train, y_train)

    all_y_pred = clf.predict(X_test)

    # Average the segment predictions of every session in a single pass;
    # sort=False keeps the sessions in order of first appearance.
    # NOTE(review): predictions are cast to integers before averaging, which
    # also truncates RandomForestRegressor outputs — confirm this is intended
    # when regressionModel is True.
    segmentPred = pd.Series(all_y_pred.astype(np.intp))
    perSession = segmentPred.groupby(test['sessionID'].to_numpy(), sort=False).mean()

    sessionIDs = perSession.index.to_numpy()
    y_pred = perSession.to_numpy()

    return sessionIDs, y_pred

In [6]:
def _chooseScoringOptions(bestField, labelName, dfRes, seed):
    """Translate the best-scoring method name into runClassification settings.

    Parameters
    ----------
    bestField : str
        Name of the result column with the lowest value for this subject.
    labelName : str
        Label being scored; every method column is prefixed with it.
    dfRes : pandas.Series
        Per-method results of the current subject, indexed by column name.
    seed : int
        Random seed used for the model-based feature selector.

    Returns
    -------
    tuple
        (useFirstHalf, regressionModel, selectFeatures, fixClassImbalance)
    """
    # The notebook spells this column both '_rfFeatureSel' and '_rfFeatueSel';
    # accept either so the random-forest selector is not silently skipped.
    rfSelFields = (labelName + '_rfFeatureSel', labelName + '_rfFeatueSel')

    useFirstHalf = bestField != labelName + '_useWholeS'
    regressionModel = bestField == labelName + '_RFregressor'

    if bestField == labelName + '_kBest50':
        selectFeatures = SelectKBest(score_func=f_classif, k=50)
    elif bestField in rfSelFields:
        selectFeatures = SelectFromModel(RandomForestClassifier(random_state = seed))
    elif regressionModel:
        selectFeatures = SelectKBest(score_func=f_regression, k=selectKbest)
    else:
        selectFeatures = SelectKBest(score_func=f_classif, k=selectKbest)

    if bestField == labelName + '_undersample':
        fixClassImbalance = 'RandomUnderSampler'
    else:
        fixClassImbalance = 'none'

    if bestField == labelName + '_bestCombination':
        # Re-derive every option independently by comparing each variant
        # against the base configuration.
        mseBase = dfRes[labelName+'_base']
        mseUseWholeS = dfRes[labelName+'_useWholeS']
        mseKbest50 = dfRes[labelName+'_kBest50']
        rfField = next((f for f in rfSelFields if f in dfRes.index), rfSelFields[0])
        mseRfFeatureSel = dfRes[rfField]
        mseUndersample = dfRes[labelName+'_undersample']

        useFirstHalf = bool(mseBase < mseUseWholeS)

        minSelFeatScore = min(mseBase, mseKbest50, mseRfFeatureSel)
        if minSelFeatScore == mseBase:
            selectFeatures = SelectKBest(score_func=f_classif, k=selectKbest)  #default
        elif minSelFeatScore == mseKbest50:
            selectFeatures = SelectKBest(score_func=f_classif, k=50)
        elif minSelFeatScore == mseRfFeatureSel:
            selectFeatures = SelectFromModel(RandomForestClassifier(random_state = seed))

        minClassBalScore = min(mseBase, mseUndersample)
        if minClassBalScore == mseBase:
            fixClassImbalance = 'none'
        elif minClassBalScore == mseUndersample:
            fixClassImbalance = 'RandomUnderSampler'

    return useFirstHalf, regressionModel, selectFeatures, fixClassImbalance


def SessionScoreSeed(seed):
    """Produce per-session predictions for one seed and write one CSV per label.

    For every subject and label, the method with the lowest value in that
    seed's results table is selected, the model is refitted accordingly with
    runClassification, and the session predictions are written into the
    prediction table of the label. The tables are finally saved to
    ``outputFolder``.

    The function reads the notebook-level globals ``y_fields``, ``dataset``,
    ``test_rand``, ``outputFolder``, ``subject_ids``, ``dataSetFolderName``,
    ``feature_file``, ``sessionIDs_train``, ``sessionIDs_test`` and
    ``selectKbest``, so they must be set before it is called (including when
    it is dispatched through a process pool).

    Parameters
    ----------
    seed : int
        Seed identifying the results file to read and used for model fitting.
    """
    all_results, all_pred = {}, {}
    # load tables
    for i, labelName in enumerate(y_fields):

        resFile = os.path.join('features_seeds_summary', dataset.lower(), test_rand, labelName+'_allResults_seed'+str(seed)+'.csv')
        if dataset == 'cis':
            predFile = os.path.join('features', 'BEAT-PD_SC' + str(i+1) + '_'+labelName +'_'+test_rand+'.csv')
        else:
            predFile = os.path.join(outputFolder, 'BEAT-PD_SC' + str(i+1) + '_'+labelName +'_'+ str(seed) + '.csv')

        logging.info(f'Seed {seed} Score {labelName}: Input {resFile} Output {predFile}')
        dfRes = pd.read_csv(resFile)
        # the first column becomes the index
        dfRes = dfRes.set_index(list(dfRes.columns[[0]]))
        dfPred = pd.read_csv(predFile)

        all_results[labelName] = dfRes
        all_pred[labelName]= dfPred

    for subjectID in subject_ids:
        subjectTrainFolderName = dataSetFolderName + str(subjectID)

        # sep=r'\s+' and .squeeze('columns') replace the delim_whitespace /
        # squeeze read_csv options, which newer pandas releases no longer accept.
        data = pd.read_csv(os.path.join(subjectTrainFolderName, feature_file), sep=r'\s+', header=None)
        data['sessionID'] = pd.read_csv(os.path.join(subjectTrainFolderName ,'sessionIDs.txt'), header=None).squeeze('columns')

        # add labels
        for y in y_fields:
            y_data = pd.read_csv(os.path.join(subjectTrainFolderName, 'y_'+ y + '.txt'), names=[y]).squeeze('columns')
            data[y] = y_data

        # Session IDs are matched after removing spaces.
        cleanSessionIDs = data['sessionID'].str.replace(' ','')
        train = data[cleanSessionIDs.isin(sessionIDs_train)]
        test = data[cleanSessionIDs.isin(sessionIDs_test)]

        for labelName in y_fields:
            # if all values for the current labels are Nans, continue
            if train[labelName].isnull().all():
                continue

            dfRes = all_results[labelName].loc[subjectID]
            dfPred = all_pred[labelName]

            # Label of the lowest value; idxmin does not depend on whether
            # argmin returns a position or a label.
            bestField = dfRes.idxmin()

            logging.info(f'Seed {seed} Subject {subjectID} Score {labelName} Best method {bestField}')

            useFirstHalf, regressionModel, selectFeatures, fixClassImbalance = _chooseScoringOptions(bestField, labelName, dfRes, seed)

            # regressionModel is forwarded so that the '_RFregressor' method
            # is actually refitted with a regressor.
            sessionIDs, y_pred = runClassification(train, test, labelName, useFirstHalf, selectFeatures,
                                                   fixClassImbalance = fixClassImbalance, seed=seed,
                                                   regressionModel = regressionModel)

            # update submission tables with the predictions
            for sessionPred, sessionID in zip(y_pred, sessionIDs):
                sessionID = sessionID.replace(" ","")
                idx = dfPred[dfPred['measurement_id'] == sessionID ].index.tolist()
                if len(idx)==0:
                    logging.info('session not found in pred matrix' + sessionID)
                    continue
                dfPred.at[idx[0],'prediction'] = sessionPred

    # save
    for i, labelName in enumerate(y_fields):
        fileName = 'BEAT-PD_SC' + str(i+1) + '_'+ labelName + '_'+ str(seed) +'.csv'
        logging.info(f'Save file: {fileName}')
        dfPred = all_pred[labelName]
        # the first column becomes the index so no extra index column is written
        dfPred = dfPred.set_index(list(dfPred.columns[[0]]))
        dfPred.to_csv(os.path.join(outputFolder, fileName))

### Set parameters

In [7]:
# --- Data sets and cross-validation folds ---------------------------------
datasets = ['cis', 'real']
test_rands = [f'S34_K{fold}' for fold in range(5)]

# --- Input locations -------------------------------------------------------
# Prefix of the per-subject folders; the subject ID is appended to it.
dataSetFolderName = os.path.join(
    '..', 'pre_process', 'SegmentedData_winLen500_overlap250_'
)
feature_file = 'X_data_fullFeatures.txt'

# --- Modelling settings ----------------------------------------------------
y_fields = ['on_off', 'dyskinesia', 'tremor']   # label columns to score
selectKbest = 100                               # default k for SelectKBest
testSize = 0.2                                  # forwarded to helpFunc.splitData

# --- Execution settings ----------------------------------------------------
seeds = range(1, 11)
parallel = True        # True: seeds run in a process pool; False: one after another
nworkers = 10          # size of the process pool

In [8]:
# Run the session scoring for every fold and data set.
# test_rand, outputFolder, dataset, sessionIDs_train, sessionIDs_test and
# subject_ids are module-level names that SessionScoreSeed reads directly.
for test_rand in test_rands:
    outputFolder = os.path.join('features', test_rand)
    if not os.path.isdir(outputFolder):
        logging.info(f'Making output dir: {outputFolder}')
        # makedirs also creates a missing parent folder and tolerates the
        # folder appearing between the check and the call.
        os.makedirs(outputFolder, exist_ok=True)

    for dataset in datasets:
        labels_folder = 'data'

        fileName = os.path.join(labels_folder, dataset.upper() + f'-PD_Training_Data_IDs_Labels_{test_rand}.csv')

        # The training-labels file is read once and reused for both the
        # session list and the subject list.
        allSessions_train = pd.read_csv(fileName)
        sessionIDs_train = allSessions_train['measurement_id'].to_list()
        allSessions_test = pd.read_csv(os.path.join(labels_folder ,dataset.upper() + f'-PD_Test_Data_IDs_Labels_{test_rand}.csv'))
        sessionIDs_test = allSessions_test['measurement_id'].to_list()

        allSessions = allSessions_train
        subject_ids = allSessions['subject_id'].unique()
        logging.info(f'Training data: {subject_ids.shape[0]} unique subjects in {allSessions.shape[0]} sessions')

        if parallel:
            logging.info(f'Parellel processing of {len(seeds)} seeds using {nworkers} workers')
            # NOTE(review): the workers rely on the module-level names set
            # above being visible in the child processes — presumably via the
            # 'fork' start method; confirm on platforms that use 'spawn'.
            with Pool(nworkers) as p:
                p.map(SessionScoreSeed, seeds)
        else:
            logging.info(f'Seriel processing of {len(seeds)} seeds')
            for seed in seeds:
                SessionScoreSeed(seed)

INFO: 2020-05-18 12:37:17,944: Making output dir: features/S34_K0
INFO: 2020-05-18 12:37:17,951: Training data: 16 unique subjects in 1486 sessions
INFO: 2020-05-18 12:37:17,952: Parellel processing of 10 seeds using 10 workers
INFO: 2020-05-18 12:37:17,982: Seed 1 Score on_off: Input features_seeds_summary/cis/S34_K0/on_off_allResults_seed1.csv Output features/BEAT-PD_SC1_on_off_S34_K0.csv
INFO: 2020-05-18 12:37:17,983: Seed 3 Score on_off: Input features_seeds_summary/cis/S34_K0/on_off_allResults_seed3.csv Output features/BEAT-PD_SC1_on_off_S34_K0.csv
INFO: 2020-05-18 12:37:17,983: Seed 2 Score on_off: Input features_seeds_summary/cis/S34_K0/on_off_allResults_seed2.csv Output features/BEAT-PD_SC1_on_off_S34_K0.csv
INFO: 2020-05-18 12:37:17,983: Seed 4 Score on_off: Input features_seeds_summary/cis/S34_K0/on_off_allResults_seed4.csv Output features/BEAT-PD_SC1_on_off_S34_K0.csv
INFO: 2020-05-18 12:37:17,983: Seed 5 Score on_off: Input features_seeds_summary/cis/S34_K0/on_off_allResult

FileNotFoundError: [Errno 2] File features_seeds_summary/cis/S34_K3/on_off_allResults_seed8.csv does not exist: 'features_seeds_summary/cis/S34_K3/on_off_allResults_seed8.csv'