In [1]:
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

### GLOBAL CONSTANTS
# Root directory that get_all_df prefixes to '<noise_folder><file_name>' when
# building the paths of the result XML files.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PATH_STEP1 = '/home/jaum/PG/pg/Final_Output/step1/ceemdan/'

# XML names used by get_clfs_scores: elements with tag CLF_TAG are selected by
# their CONFIG_TAG attribute and their SCORE_TAG attribute is read as a float.
CONFIG_TAG = 'config'
SCORE_TAG = 'score'
CLF_TAG = 'classifier'

# Values of the CONFIG_TAG attribute identifying each classifier; the same
# strings are reused as row labels of the result DataFrames.
KNN_TAG = 'KNeighborsClassifier(n_neighbors=1)'
RF_TAG = 'RandomForestClassifier(random_state=1010)'
SVM_TAG = 'SVC(random_state=1010)'


In [30]:
### METHODS NEEDED

#tag = classifier
#key_attr = config
#ret_attr = score
def parse_unique_tag(file_in,tag,key_attr,ret_attr):
    """Collect a numeric attribute from every matching element of an XML tree.

    Parameters
    ----------
    file_in : ElementTree or Element
        Any object exposing ``iter(tag)`` that yields elements with an
        ``attrib`` mapping.
    tag : str
        Tag name of the elements to inspect.
    key_attr : sequence
        ``(attribute_name, expected_value)`` pair; an element matches when it
        carries ``attribute_name`` with exactly ``expected_value``.
    ret_attr : str
        Name of the attribute whose value is converted to ``float`` and
        returned for each matching element.

    Returns
    -------
    list of float
        One value per matching element, in iteration (document) order.

    Raises
    ------
    KeyError
        If a *matching* element has no ``ret_attr`` attribute.
    ValueError
        If a ``ret_attr`` value cannot be parsed as a float.
    """
    key_name, key_value = key_attr[0], key_attr[1]
    # Elements of the right tag that lack the key attribute are simply not a
    # match; they used to abort the whole scan with a KeyError.
    return [float(elem.attrib[ret_attr])
            for elem in file_in.iter(tag)
            if key_name in elem.attrib and elem.attrib[key_name] == key_value]

def get_clfs_scores(file_in):
    """Return the score lists of the three classifiers found in ``file_in``.

    For each of KNN_TAG, RF_TAG and SVM_TAG (in that order) the CLF_TAG
    elements whose CONFIG_TAG attribute equals the classifier string are
    looked up through ``parse_unique_tag`` and their SCORE_TAG values collected.

    Returns a 3-tuple ``(knn_scores, rf_scores, svm_scores)`` of float lists.
    """
    return tuple(
        parse_unique_tag(file_in, CLF_TAG, (CONFIG_TAG, clf_config), SCORE_TAG)
        for clf_config in (KNN_TAG, RF_TAG, SVM_TAG)
    )

def _mean_by_chunk(scores, n_chunks, chunk_size):
    """Return the mean of each of the first ``n_chunks`` consecutive ``chunk_size``-long slices of ``scores``."""
    return [np.mean(scores[k * chunk_size:(k + 1) * chunk_size]) for k in range(n_chunks)]


def get_score_results_df(normal_file_path, unbiased_file_path, noise,
                         imfs=(4, 8, 16), scores_per_config=100):
    """Build the mean-score tables of the normal and unbiased experiments.

    Both XML files are parsed, the score list of every classifier is read with
    ``get_clfs_scores`` and averaged over consecutive groups of
    ``scores_per_config`` values; each group becomes one column.

    Parameters
    ----------
    normal_file_path, unbiased_file_path : str
        Paths of the XML result files handed to ``ET.parse``.
    noise : object
        Value formatted into the column labels
        (``'<Normal|Unbiased>_Acc_Noise_<noise>_Imfs_<n>'``).
    imfs : sequence, optional
        Values used for the ``Imfs_<n>`` part of the column labels, one per
        group of scores, in file order. Defaults to ``(4, 8, 16)``.
    scores_per_config : int, optional
        Number of consecutive scores averaged into one column. Defaults to 100.

    Returns
    -------
    (normal_score_df, unbiased_score_df) : tuple of pandas.DataFrame
        Rows are indexed by KNN_TAG, RF_TAG, SVM_TAG; columns by the labels
        described above.

    Raises
    ------
    ValueError
        If ``scores_per_config`` is not positive, or if the number of complete
        score groups does not match ``len(imfs)``.
    """
    if scores_per_config < 1:
        raise ValueError('scores_per_config must be a positive integer, got {0!r}'
                         .format(scores_per_config))

    normal_file = ET.parse(normal_file_path)
    unbiased_file = ET.parse(unbiased_file_path)

    normal_scores = get_clfs_scores(normal_file)
    unbiased_scores = get_clfs_scores(unbiased_file)

    # The number of groups is driven by the KNN scores of the normal file and
    # applied to every other score list; a trailing incomplete group is ignored.
    n_configs = len(normal_scores[0]) // scores_per_config
    if n_configs != len(imfs):
        raise ValueError(
            'found {0} complete group(s) of {1} scores but {2} IMF label(s) were given'
            .format(n_configs, scores_per_config, len(imfs)))

    clf_tags = [KNN_TAG, RF_TAG, SVM_TAG]

    def build_df(prefix, scores_by_clf):
        # One row per classifier, one column per IMF configuration.
        columns = ['{0}_Acc_Noise_{1}_Imfs_{2}'.format(prefix, noise, n_imfs)
                   for n_imfs in imfs]
        rows = [_mean_by_chunk(clf_scores, n_configs, scores_per_config)
                for clf_scores in scores_by_clf]
        return pd.DataFrame(rows, index=clf_tags, columns=columns)

    return build_df('Normal', normal_scores), build_df('Unbiased', unbiased_scores)

def _best_per_classifier(score_df, label_column):
    """Return, for each classifier row of ``score_df``, the best column label and its score rounded to 6 decimals."""
    clf_tags = [KNN_TAG, RF_TAG, SVM_TAG]
    records = []
    for clf in clf_tags:
        clf_scores = score_df.loc[clf]
        # idxmax() yields the column *label* of the maximum. np.argmax() on a
        # Series returns an integer position on current pandas, which would
        # put 0/1/2 in the table instead of the configuration name.
        records.append({label_column: clf_scores.idxmax(),
                        'Score': round(clf_scores.max(), 6)})
    return pd.DataFrame(records, index=clf_tags, columns=[label_column, 'Score'])


def get_best_config_df(normal_score_df, unbiased_score_df):
    """Pick the best-scoring configuration of each classifier in both experiments.

    Parameters
    ----------
    normal_score_df, unbiased_score_df : pandas.DataFrame
        Score tables with one row per classifier (KNN_TAG, RF_TAG, SVM_TAG)
        and one column per configuration.

    Returns
    -------
    pandas.DataFrame
        Indexed by classifier with columns ``Best_Normal``, ``Score_x``,
        ``Best_Unbiased``, ``Score_y``. The ``_x``/``_y`` suffixes are added
        by ``pd.merge`` because both halves carry a ``Score`` column.
    """
    best_normal_df = _best_per_classifier(normal_score_df, 'Best_Normal')
    best_unbiased_df = _best_per_classifier(unbiased_score_df, 'Best_Unbiased')

    return pd.merge(best_normal_df, best_unbiased_df, how='inner', left_index=True, right_index=True)

def get_all_df(noise_folder, normal_file_name, unbiased_file_name, noise_strength):
    """Load one noise level and return its score tables plus the best-config table.

    The file paths are formed by plain concatenation
    ``PATH_STEP1 + noise_folder + file_name``, so ``noise_folder`` must carry
    its own trailing separator.

    Returns ``(normal_score_df, unbiased_score_df, best_config_df)``.
    """
    folder_path = PATH_STEP1 + noise_folder

    normal_df, unbiased_df = get_score_results_df(folder_path + normal_file_name,
                                                  folder_path + unbiased_file_name,
                                                  noise_strength)

    return normal_df, unbiased_df, get_best_config_df(normal_df, unbiased_df)

# best_01_df, best_02_df, best_03_df, best_04_df
def get_final_df(best_dfs):
    """Reduce several best-config tables to the overall best per classifier.

    ``best_dfs`` is an iterable of DataFrames indexed by KNN_TAG, RF_TAG and
    SVM_TAG with columns ``Best_Normal``, ``Score_x``, ``Best_Unbiased`` and
    ``Score_y``. For every classifier the entry with the highest ``Score_x``
    (normal) and the one with the highest ``Score_y`` (unbiased) are kept; on
    ties the earliest table wins.

    Returns a DataFrame indexed by classifier with columns ``Best_Normal``,
    ``Score_x``, ``Best_Unbiased``, ``Score_y``.
    """
    frames = list(best_dfs)
    clf_tags = [KNN_TAG, RF_TAG, SVM_TAG]

    halves = []
    for label_col, score_col in (('Best_Normal', 'Score_x'), ('Best_Unbiased', 'Score_y')):
        winners = []
        for clf in clf_tags:
            # (configuration label, score) of this classifier in every table.
            candidates = [(frame.loc[clf][label_col], frame.loc[clf][score_col])
                          for frame in frames]
            top_label, top_score = max(candidates, key=lambda pair: pair[1])
            winners.append(pd.Series({label_col: top_label, 'Score': top_score}))
        halves.append(pd.DataFrame(winners, index=clf_tags))

    best_normal_config_df, best_unbiased_config_df = halves
    # Both halves carry a 'Score' column, so the merge renames them Score_x / Score_y.
    return pd.merge(best_normal_config_df, best_unbiased_config_df,
                    how='inner', left_index=True, right_index=True)

In [3]:
## NOISE = 0.01: ACCURACY per CEEMDAN IMF count for the NORMAL and UNBIASED experiments.
NOISE_001_FOLDER = 'noise_001/'
NORMAL_ACC_RESULTS_FILE = 'step1_ceemdan001_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_ceemdan001_unbiased_acc_results.xml'

normal_001_score_df, unbiased_001_score_df, best_001_config_df = get_all_df(
    noise_folder=NOISE_001_FOLDER,
    normal_file_name=NORMAL_ACC_RESULTS_FILE,
    unbiased_file_name=UNBIASED_ACC_RESULTS_FILE,
    noise_strength=0.01,
)

In [4]:
normal_001_score_df

Unnamed: 0,Normal_Acc_Noise_0.01_Imfs_4,Normal_Acc_Noise_0.01_Imfs_8,Normal_Acc_Noise_0.01_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.979502,0.979378,0.979381
RandomForestClassifier(random_state=1010),0.989749,0.989295,0.986548
SVC(random_state=1010),0.232986,0.221927,0.191223


In [5]:
unbiased_001_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.01_Imfs_4,Unbiased_Acc_Noise_0.01_Imfs_8,Unbiased_Acc_Noise_0.01_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.764003,0.764794,0.764003
RandomForestClassifier(random_state=1010),0.775879,0.836938,0.818719
SVC(random_state=1010),0.136765,0.131634,0.088998


In [6]:
best_001_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.01_Imfs_4,0.979502,Unbiased_Acc_Noise_0.01_Imfs_8,0.764794
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.01_Imfs_4,0.989749,Unbiased_Acc_Noise_0.01_Imfs_8,0.836938
SVC(random_state=1010),Normal_Acc_Noise_0.01_Imfs_4,0.232986,Unbiased_Acc_Noise_0.01_Imfs_4,0.136765


In [7]:
## NOISE = 0.05: ACCURACY per CEEMDAN IMF count for the NORMAL and UNBIASED experiments.
NOISE_005_FOLDER = 'noise_005/'
NORMAL_ACC_RESULTS_FILE = 'step1_ceemdan005_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_ceemdan005_unbiased_acc_results.xml'

normal_005_score_df, unbiased_005_score_df, best_005_config_df = get_all_df(
    noise_folder=NOISE_005_FOLDER,
    normal_file_name=NORMAL_ACC_RESULTS_FILE,
    unbiased_file_name=UNBIASED_ACC_RESULTS_FILE,
    noise_strength=0.05,
)

In [8]:
normal_005_score_df

Unnamed: 0,Normal_Acc_Noise_0.05_Imfs_4,Normal_Acc_Noise_0.05_Imfs_8,Normal_Acc_Noise_0.05_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.978934,0.978012,0.978292
RandomForestClassifier(random_state=1010),0.991158,0.99257,0.991076
SVC(random_state=1010),0.233197,0.221466,0.19054


In [9]:
unbiased_005_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.05_Imfs_4,Unbiased_Acc_Noise_0.05_Imfs_8,Unbiased_Acc_Noise_0.05_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.76949,0.762353,0.758986
RandomForestClassifier(random_state=1010),0.804173,0.863434,0.854128
SVC(random_state=1010),0.136515,0.131724,0.088906


In [10]:
best_005_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.05_Imfs_4,0.978934,Unbiased_Acc_Noise_0.05_Imfs_4,0.76949
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.05_Imfs_8,0.99257,Unbiased_Acc_Noise_0.05_Imfs_8,0.863434
SVC(random_state=1010),Normal_Acc_Noise_0.05_Imfs_4,0.233197,Unbiased_Acc_Noise_0.05_Imfs_4,0.136515


In [11]:
## NOISE = 0.1: ACCURACY per CEEMDAN IMF count for the NORMAL and UNBIASED experiments.
NOISE_01_FOLDER = 'noise_01/'
NORMAL_ACC_RESULTS_FILE = 'step1_ceemdan01_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_ceemdan01_unbiased_acc_results.xml'

normal_01_score_df, unbiased_01_score_df, best_01_config_df = get_all_df(
    noise_folder=NOISE_01_FOLDER,
    normal_file_name=NORMAL_ACC_RESULTS_FILE,
    unbiased_file_name=UNBIASED_ACC_RESULTS_FILE,
    noise_strength=0.1,
)

In [12]:
normal_01_score_df


Unnamed: 0,Normal_Acc_Noise_0.1_Imfs_4,Normal_Acc_Noise_0.1_Imfs_8,Normal_Acc_Noise_0.1_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.978893,0.976,0.975968
RandomForestClassifier(random_state=1010),0.99191,0.99312,0.991796
SVC(random_state=1010),0.233948,0.222272,0.18995


In [13]:
unbiased_01_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.1_Imfs_4,Unbiased_Acc_Noise_0.1_Imfs_8,Unbiased_Acc_Noise_0.1_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.771184,0.76875,0.765571
RandomForestClassifier(random_state=1010),0.799239,0.839442,0.817573
SVC(random_state=1010),0.137353,0.132142,0.088942


In [14]:
best_01_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.1_Imfs_4,0.978893,Unbiased_Acc_Noise_0.1_Imfs_4,0.771184
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.1_Imfs_8,0.99312,Unbiased_Acc_Noise_0.1_Imfs_8,0.839442
SVC(random_state=1010),Normal_Acc_Noise_0.1_Imfs_4,0.233948,Unbiased_Acc_Noise_0.1_Imfs_4,0.137353


In [15]:
## NOISE = 0.2: ACCURACY per CEEMDAN IMF count for the NORMAL and UNBIASED experiments.
NOISE_02_FOLDER = 'noise_02/'
NORMAL_ACC_RESULTS_FILE = 'step1_ceemdan02_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_ceemdan02_unbiased_acc_results.xml'

normal_02_score_df, unbiased_02_score_df, best_02_config_df = get_all_df(
    noise_folder=NOISE_02_FOLDER,
    normal_file_name=NORMAL_ACC_RESULTS_FILE,
    unbiased_file_name=UNBIASED_ACC_RESULTS_FILE,
    noise_strength=0.2,
)

In [16]:
normal_02_score_df

Unnamed: 0,Normal_Acc_Noise_0.2_Imfs_4,Normal_Acc_Noise_0.2_Imfs_8,Normal_Acc_Noise_0.2_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.980446,0.978225,0.978721
RandomForestClassifier(random_state=1010),0.992248,0.996252,0.995131
SVC(random_state=1010),0.236088,0.223911,0.189141


In [17]:
unbiased_02_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.2_Imfs_4,Unbiased_Acc_Noise_0.2_Imfs_8,Unbiased_Acc_Noise_0.2_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.792479,0.780623,0.774682
RandomForestClassifier(random_state=1010),0.817114,0.856631,0.853641
SVC(random_state=1010),0.140193,0.133301,0.089134


In [18]:
best_02_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.2_Imfs_4,0.980446,Unbiased_Acc_Noise_0.2_Imfs_4,0.792479
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.2_Imfs_8,0.996252,Unbiased_Acc_Noise_0.2_Imfs_8,0.856631
SVC(random_state=1010),Normal_Acc_Noise_0.2_Imfs_4,0.236088,Unbiased_Acc_Noise_0.2_Imfs_4,0.140193


In [19]:
## NOISE = 0.3: ACCURACY per CEEMDAN IMF count for the NORMAL and UNBIASED experiments.
NOISE_03_FOLDER = 'noise_03/'
NORMAL_ACC_RESULTS_FILE = 'step1_ceemdan03_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_ceemdan03_unbiased_acc_results.xml'

normal_03_score_df, unbiased_03_score_df, best_03_config_df = get_all_df(
    noise_folder=NOISE_03_FOLDER,
    normal_file_name=NORMAL_ACC_RESULTS_FILE,
    unbiased_file_name=UNBIASED_ACC_RESULTS_FILE,
    noise_strength=0.3,
)

In [20]:
normal_03_score_df

Unnamed: 0,Normal_Acc_Noise_0.3_Imfs_4,Normal_Acc_Noise_0.3_Imfs_8,Normal_Acc_Noise_0.3_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.978093,0.975793,0.976414
RandomForestClassifier(random_state=1010),0.992533,0.99597,0.996341
SVC(random_state=1010),0.237665,0.225057,0.188302


In [21]:
unbiased_03_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.3_Imfs_4,Unbiased_Acc_Noise_0.3_Imfs_8,Unbiased_Acc_Noise_0.3_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.785202,0.773059,0.766091
RandomForestClassifier(random_state=1010),0.812838,0.87056,0.8503
SVC(random_state=1010),0.14204,0.133546,0.088889


In [22]:
best_03_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.3_Imfs_4,0.978093,Unbiased_Acc_Noise_0.3_Imfs_4,0.785202
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.3_Imfs_16,0.996341,Unbiased_Acc_Noise_0.3_Imfs_8,0.87056
SVC(random_state=1010),Normal_Acc_Noise_0.3_Imfs_4,0.237665,Unbiased_Acc_Noise_0.3_Imfs_4,0.14204


In [23]:
## NOISE = 0.4: ACCURACY per CEEMDAN IMF count for the NORMAL and UNBIASED experiments.
NOISE_04_FOLDER = 'noise_04/'
NORMAL_ACC_RESULTS_FILE = 'step1_ceemdan04_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_ceemdan04_unbiased_acc_results.xml'

normal_04_score_df, unbiased_04_score_df, best_04_config_df = get_all_df(
    noise_folder=NOISE_04_FOLDER,
    normal_file_name=NORMAL_ACC_RESULTS_FILE,
    unbiased_file_name=UNBIASED_ACC_RESULTS_FILE,
    noise_strength=0.4,
)

In [24]:
normal_04_score_df

Unnamed: 0,Normal_Acc_Noise_0.4_Imfs_4,Normal_Acc_Noise_0.4_Imfs_8,Normal_Acc_Noise_0.4_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.977618,0.974394,0.974732
RandomForestClassifier(random_state=1010),0.991829,0.9973,0.996033
SVC(random_state=1010),0.238774,0.226488,0.186811


In [25]:
unbiased_04_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.4_Imfs_4,Unbiased_Acc_Noise_0.4_Imfs_8,Unbiased_Acc_Noise_0.4_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.769441,0.757882,0.752349
RandomForestClassifier(random_state=1010),0.807927,0.874954,0.865654
SVC(random_state=1010),0.143487,0.134394,0.088851


In [26]:
best_04_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.4_Imfs_4,0.977618,Unbiased_Acc_Noise_0.4_Imfs_4,0.769441
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.4_Imfs_8,0.9973,Unbiased_Acc_Noise_0.4_Imfs_8,0.874954
SVC(random_state=1010),Normal_Acc_Noise_0.4_Imfs_4,0.238774,Unbiased_Acc_Noise_0.4_Imfs_4,0.143487


In [31]:
# Collect the best-config table of every noise level (0.01 through 0.4) and
# reduce them to the single best-scoring configuration per classifier.
best_configs = [
    best_001_config_df,
    best_005_config_df,
    best_01_config_df,
    best_02_config_df,
    best_03_config_df,
    best_04_config_df,
]
get_final_df(best_configs)

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.2_Imfs_4,0.980446,Unbiased_Acc_Noise_0.2_Imfs_4,0.792479
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.4_Imfs_8,0.9973,Unbiased_Acc_Noise_0.4_Imfs_8,0.874954
SVC(random_state=1010),Normal_Acc_Noise_0.4_Imfs_4,0.238774,Unbiased_Acc_Noise_0.4_Imfs_4,0.143487
