In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import numpy as np
import pandas as pd
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

from multiprocessing import Pool

from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_classif, f_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import ClassificationUtils as helpFunc

import imblearn

%load_ext autoreload
%autoreload 2

In [None]:
# --- Input locations -------------------------------------------------------
# Per-subject folders are addressed as dataSetFolderName + <subject id>.
dataSetFolderName = os.path.join('pre_process', 'SegmentedData_winLen500_overlap250_')
feature_file = 'X_data_fullFeatures.txt'   # feature matrix file inside each subject folder
labels_folder = 'data'                     # folder holding the per-dataset label CSVs

# --- Experiment settings ---------------------------------------------------
y_fields = ['on_off', 'dyskinesia', 'tremor']   # label columns to model
testSize = 0.2                                  # forwarded to helpFunc.splitData
# NOTE(review): runClassification takes its own useFirstHalf argument and
# processSeed passes explicit values, so this global looks unused here.
useFirstHalf = True
selectKbest = 100                               # k of the default SelectKBest selector

# --- Execution -------------------------------------------------------------
seeds = range(21,40)   # one processSeed call per seed
parallel = True        # True: run the seeds through a multiprocessing Pool
nworkers = 6           # number of Pool workers when parallel is True

In [None]:
def runClassification(data, labelName, useFirstHalf, selectFeatures, fixClassImbalance = 'none', seed=1, regressionModel=False):
    """Fit a random forest for one label and return its ``'mseMean'`` score.

    Pipeline: optional session cut -> train/test split -> drop rows whose
    label is missing -> min-max scaling to [-1, 1] -> feature selection ->
    optional resampling of the training set -> fit/evaluate through
    ``helpFunc.runModel``.

    Reads the module-level globals ``testSize`` and ``y_fields``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus a ``sessionID`` column and one column per entry
        of ``y_fields``.
    labelName : str
        Name of the column used as the target.
    useFirstHalf : bool
        When true, ``data`` is first passed through ``helpFunc.cutSession``
        (presumably keeping the first half of each session -- confirm in
        ClassificationUtils).
    selectFeatures : object
        Selector exposing ``fit``/``transform`` (e.g. ``SelectKBest`` or
        ``SelectFromModel``). It is (re)fitted here on the training split.
    fixClassImbalance : {'none', 'SMOTEENN', 'RandomUnderSampler'}
        Resampling applied to the training split only.
    seed : int
        Seed for the train/test split and for the random forest.
    regressionModel : bool
        Use ``RandomForestRegressor`` instead of ``RandomForestClassifier``.

    Returns
    -------
    The ``'mseMean'`` entry of the mapping returned by ``helpFunc.runModel``.

    Raises
    ------
    ValueError
        If ``fixClassImbalance`` is not one of the supported strategies.
    """
    # Fail fast on a misspelled strategy; previously it was silently treated
    # as "no resampling", producing results under a misleading name.
    if fixClassImbalance not in ('none', 'SMOTEENN', 'RandomUnderSampler'):
        raise ValueError(
            "fixClassImbalance must be 'none', 'SMOTEENN' or 'RandomUnderSampler', "
            f"got {fixClassImbalance!r}")

    # use only first half of session
    if useFirstHalf:
        data = helpFunc.cutSession(data)

    # Split to train and test
    train, test = helpFunc.splitData(data, testSize=testSize, seed=seed)

    # remove rows whose target label is NaN
    train = train.dropna(subset=[labelName])
    test = test.dropna(subset=[labelName])

    # Reassign indexes of train and test (which changed after the
    # manipulations above) so that they match a simple 0..n-1 range.
    train.index = range(len(train))
    test.index = range(len(test))

    # Get X and y (train and test) and the set of label values
    nonFeatureColumns = ['sessionID'] + y_fields
    X_train = train.drop(nonFeatureColumns, axis=1)
    y_train = train[labelName]
    X_test = test.drop(nonFeatureColumns, axis=1)
    y_test = test[labelName]

    existingLabels = np.unique(np.append(y_train.unique(),y_test.unique()))
    labels = {l: str(l) for l in existingLabels}

    # Normalization. The scaler is fitted on the training split only so that
    # no test-set statistics leak into preprocessing; test values may
    # therefore fall slightly outside [-1, 1].
    min_max_scaler = preprocessing.MinMaxScaler(feature_range = (-1,1))
    min_max_scaler.fit(X_train)
    X_train = min_max_scaler.transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # feature selection: fit on train, apply to both splits
    selFeat = selectFeatures.fit(X_train, y_train)
    X_train = selFeat.transform(X_train)
    X_test = selFeat.transform(X_test)

    # handle class imbalance (training split only)
    if fixClassImbalance == 'SMOTEENN':
        smote_enn = imblearn.combine.SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    elif fixClassImbalance == 'RandomUnderSampler':
        rus = imblearn.under_sampling.RandomUnderSampler(random_state=42)
        X_train, y_train = rus.fit_resample(X_train, y_train)

    # run models
    if regressionModel:
        clf = RandomForestRegressor(random_state = seed)
    else:
        clf = RandomForestClassifier(random_state = seed)

    clfRes = helpFunc.runModel(clf, X_train, y_train, X_test, y_test, test, labels, labelName=labelName)

    return clfRes['mseMean']
        

In [None]:
def processSeed(seed):
    """Run every model variant for every subject and label with one seed.

    For each subject the feature matrix, session ids and labels are loaded
    from ``dataSetFolderName + <subject id>``. For each label that has at
    least one non-NaN value, several ``runClassification`` variants are
    evaluated and their scores stored in the per-label result frames. After
    all subjects are processed, each result frame is written to
    ``<label>_allResults_seed<seed>.csv`` inside ``folder_path``.

    Reads the module-level globals ``subject_ids``, ``dataSetFolderName``,
    ``feature_file``, ``y_fields``, ``selectKbest``, ``scores_naive``,
    ``folder_path`` and ``all_results_<label>`` (one frame per label), and
    writes into the latter in place.

    Parameters
    ----------
    seed : int
        Random seed forwarded to ``runClassification`` and to the
        random-forest based feature selector.

    Returns
    -------
    None
    """
    logging.info(f'Ensemble models: started seed {seed} PID {os.getpid()} PPID {os.getppid()}')

    # Explicit namespace lookup instead of eval() on a constructed string.
    resultFrames = {name: globals()['all_results_' + name] for name in y_fields}

    for subject_id in subject_ids:
        subjectFolderName = dataSetFolderName + str(subject_id)

        # sep=r'\s+' is the documented equivalent of delim_whitespace=True;
        # .squeeze('columns') replaces the squeeze=True argument. Both forms
        # also work on pandas 2.x, where the old arguments are gone/deprecated.
        data = pd.read_csv(os.path.join(subjectFolderName, feature_file), sep=r'\s+', header=None)
        # add session number column to the dataframe
        data['sessionID'] = pd.read_csv(os.path.join(subjectFolderName ,'sessionIDs.txt'), header=None).squeeze('columns')

        # add labels
        for y in y_fields:
            y_data = pd.read_csv(os.path.join(subjectFolderName, 'y_'+ y + '.txt'), names=[y]).squeeze('columns')
            data[y] = y_data

        logging.info(f'PID {os.getpid()} PPID {os.getppid()} Subject #{subject_id}')

        for labelName in y_fields:
            # if all values for the current label are NaN, skip it
            if data[labelName].isnull().all():
                continue

            dfRes = resultFrames[labelName]
            selectFeatures = SelectKBest(score_func=f_classif, k=selectKbest)  #default

            dfRes.loc[[subject_id],[labelName+'_naive']] = scores_naive.loc[subject_id,labelName]

            # baseline: first half of session, default selector, no resampling
            mseBase = runClassification(data, labelName, True, selectFeatures, fixClassImbalance = 'none', seed=seed)
            dfRes.loc[[subject_id],[labelName+'_base']] = mseBase

            # regressor instead of classifier, with regression feature scores
            selectRegFeatures = SelectKBest(score_func=f_regression, k=selectKbest)
            mseReg = runClassification(data, labelName, True, selectRegFeatures, fixClassImbalance = 'none', seed=seed, regressionModel = True)
            dfRes.loc[[subject_id],[labelName+'_RFregressor']] = mseReg

            # whole session instead of first half
            mseUseWholeS = runClassification(data, labelName, False, selectFeatures, fixClassImbalance = 'none', seed=seed)
            dfRes.loc[[subject_id],[labelName+'_useWholeS']] = mseUseWholeS

            # 50 best features instead of selectKbest
            selectFeatures50 = SelectKBest(score_func=f_classif, k=50)
            mseKbest50 = runClassification(data, labelName, True, selectFeatures50, fixClassImbalance = 'none', seed=seed)
            dfRes.loc[[subject_id],[labelName+'_kBest50']] = mseKbest50

            # random-forest importance based feature selection
            selectFeaturesRF = SelectFromModel(RandomForestClassifier(random_state = seed))
            mseRfFeatueSel = runClassification(data, labelName, True, selectFeaturesRF, fixClassImbalance = 'none', seed=seed)
            dfRes.loc[[subject_id],[labelName+'_rfFeatureSel']] = mseRfFeatueSel

            # random undersampling of the training set
            mseUndersample = runClassification(data, labelName, True, selectFeatures, fixClassImbalance = 'RandomUnderSampler', seed=seed)
            dfRes.loc[[subject_id],[labelName+'_undersample']] = mseUndersample

            # best combination: for each option keep the variant with the
            # lower score; ties (and NaN comparisons) keep the baseline choice.
            bestUseFirstHalf = mseBase < mseUseWholeS

            # Start from the default selector so a choice is always defined,
            # even when a score is NaN and no equality below matches.
            bestSelector = SelectKBest(score_func=f_classif, k=selectKbest)
            minSelFeatScore = min(mseBase, mseKbest50, mseRfFeatueSel)
            if minSelFeatScore != mseBase:
                if minSelFeatScore == mseKbest50:
                    bestSelector = SelectKBest(score_func=f_classif, k=50)
                elif minSelFeatScore == mseRfFeatueSel:
                    bestSelector = SelectFromModel(RandomForestClassifier(random_state = seed))

            # Undersampling is chosen only when strictly better than baseline.
            # Previously a NaN score left this variable unbound or carrying
            # the value from an earlier subject/label.
            bestImbalanceFix = 'RandomUnderSampler' if mseUndersample < mseBase else 'none'

            mseBestComb = runClassification(data, labelName, bestUseFirstHalf, bestSelector, fixClassImbalance = bestImbalanceFix ,seed=seed)
            dfRes.loc[[subject_id],[labelName+'_bestCombination']] = mseBestComb

    # one CSV per label for this seed
    for labelName in y_fields:
        fileName = labelName + '_allResults_seed' + str(seed) + '.csv'
        resultFrames[labelName].to_csv(os.path.join(folder_path, fileName))

    logging.info(f'Ensemble models: finished seed {seed}')

In [None]:
# Driver: for each dataset, compute the naive per-subject scores, prepare the
# per-label result frames that processSeed fills in, and run all seeds.
datasets = ['real', 'cis']

for dataset in datasets:
    logging.info(f'Running on dataset: {dataset}')
    label_file = os.path.join(labels_folder, dataset.upper()+'-PD_Training_Data_IDs_Labels.csv')
    trainig_labels = pd.read_csv(label_file)
    subject_ids = trainig_labels['subject_id'].unique()
    subject_count=trainig_labels['subject_id'].value_counts().sort_index()
    all_mse_results = pd.DataFrame({  'count': subject_count, 'on_off_naive' : np.nan, 'tremor_naive': np.nan, 'dyskinesia_naive': np.nan})

    # naive scores: per subject and label, the mean squared deviation of the
    # labels from the subject's own label mean
    scores_naive = pd.DataFrame({'count': subject_count, 'on_off':np.nan,'tremor':np.nan, 'dyskinesia':np.nan})

    for subject_id in subject_ids:
        subject_indexes = trainig_labels['subject_id'] == subject_id
        for labelName in y_fields:
            y_labels = trainig_labels.loc[subject_indexes, labelName]
            score = ((y_labels - y_labels.mean())**2).mean()
            scores_naive.loc[[subject_id],[labelName]] = score

    # Keep the per-subject frame and build a new one with an extra, initially
    # all-NaN 'Total Score' row. pd.concat replaces DataFrame.append, which
    # no longer exists in pandas 2.x.
    scores_naive_orig = scores_naive
    total_row = pd.DataFrame(np.nan, index=['Total Score'], columns=scores_naive_orig.columns)
    scores_naive = pd.concat([scores_naive_orig, total_row])

    # Weighted average of the per-subject scores, weight = sqrt(ceil(0.3 * count)).
    # The weights do not depend on the label, so they are computed once
    # (the original repeated this identical computation once per label).
    # NOTE(review): 0.3 differs from testSize (0.2) -- confirm the intended fraction.
    # NOTE(review): Series.sum() skips NaN scores while sqnSum still includes
    # those subjects' weights -- confirm this is intended.
    sqn = np.sqrt(np.ceil(scores_naive_orig.loc[:, 'count']*0.3).to_numpy(dtype=np.float32))
    sqnSum = sqn.sum()
    for column in scores_naive_orig.columns[1:]:
        score = (sqn * scores_naive_orig.loc[:, column]).sum() / sqnSum
        scores_naive.loc['Total Score',column] = score

    # Create results dataframes (read by name from processSeed)
    all_results_on_off = pd.DataFrame({  'count': subject_count})
    all_results_tremor = pd.DataFrame({  'count': subject_count})
    all_results_dyskinesia = pd.DataFrame({  'count': subject_count})
    result_frames = {
        'on_off': all_results_on_off,
        'tremor': all_results_tremor,
        'dyskinesia': all_results_dyskinesia,
    }
    # Column order is preserved because it determines the CSV layout.
    result_suffixes = ('_naive', '_base', '_useWholeS', '_kBest50',
                       '_rfFeatureSel', '_undersample', '_bestCombination',
                       '_RFregressor')
    for labelName in y_fields:
        df = result_frames[labelName]
        for suffix in result_suffixes:
            df[labelName + suffix] = np.nan

    folder_path = os.path.join('features_seeds_summary',dataset, 'Submit_Final')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(folder_path, exist_ok=True)

    warnings.filterwarnings('ignore')

    # run over seeds
    # NOTE(review): processSeed reads the globals set above; the Pool is
    # created only after they are assigned. Presumably this relies on the
    # workers inheriting the parent's state (fork start method) -- confirm
    # on platforms that use spawn.
    if parallel:
        with Pool(nworkers) as p:
            p.map(processSeed, seeds)
    else:
        for seed in seeds:
            processSeed(seed)