In [1]:
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

### GLOBAL CONSTANTS
# Base directory of the step-1 EEMD result files. get_all_df() builds file
# paths by plain string concatenation (PATH_STEP1 + folder + file name), so
# the trailing slash is required.
# NOTE(review): absolute, machine-specific path -- the notebook only runs on a
# machine where this directory exists.
PATH_STEP1 = '/home/jaum/PG/pg/Final_Output/step1/eemd/'

# Names used when reading the result XML: CLF_TAG is the element name that is
# iterated over, CONFIG_TAG is the attribute compared against a classifier
# identifier, and SCORE_TAG is the attribute whose value is parsed as a float.
CONFIG_TAG = 'config'
SCORE_TAG = 'score'
CLF_TAG = 'classifier'

# Classifier identifiers: matched against the CONFIG_TAG attribute value and
# reused as the row labels of the score DataFrames.
KNN_TAG = 'KNeighborsClassifier(n_neighbors=1)'
RF_TAG = 'RandomForestClassifier(random_state=1010)'
SVM_TAG = 'SVC(random_state=1010)'

In [2]:
### METHODS NEEDED

def parse_unique_tag(file_in,tag,key_attr,ret_attr):
    """Collect one attribute, as floats, from every matching XML element.

    Parameters
    ----------
    file_in : object exposing ``iter(tag)`` (e.g. an ElementTree or Element)
        The parsed XML to search.
    tag : str
        Element name to iterate over (the callers in this notebook pass the
        classifier tag).
    key_attr : sequence
        ``(attribute_name, expected_value)``; only elements whose attribute
        equals the expected value are kept.
    ret_attr : str
        Name of the attribute whose value is converted with ``float``.

    Returns
    -------
    list of float
        The converted values, in iteration order.

    Raises
    ------
    KeyError
        If a visited element lacks ``key_attr[0]``, or a kept element lacks
        ``ret_attr`` (attributes are read with plain dict indexing).
    ValueError
        If a kept value cannot be converted to ``float``.
    """
    return [
        float(node.attrib[ret_attr])
        for node in file_in.iter(tag)
        if node.attrib[key_attr[0]] == key_attr[1]
    ]

def get_clfs_scores(file_in):
    """Return the score lists of the three classifiers found in ``file_in``.

    For each of KNN_TAG, RF_TAG and SVM_TAG (in that order) the CLF_TAG
    elements whose CONFIG_TAG attribute equals the classifier identifier are
    selected and their SCORE_TAG attribute is read as a float.

    Returns
    -------
    tuple of three lists of float
        ``(knn_scores, rf_scores, svm_scores)``.
    """
    return tuple(
        parse_unique_tag(file_in, CLF_TAG, (CONFIG_TAG, clf_config), SCORE_TAG)
        for clf_config in (KNN_TAG, RF_TAG, SVM_TAG)
    )

def _mean_by_group(scores, n_groups, group_size):
    """Return the mean of each of the first ``n_groups`` consecutive runs of ``group_size`` scores."""
    return [np.mean(scores[g * group_size:(g + 1) * group_size])
            for g in range(n_groups)]


def _build_score_df(scores_per_clf, n_groups, group_size, columns):
    """Build a classifier-by-configuration frame of mean scores (rows: KNN, RF, SVM)."""
    rows = [pd.Series(_mean_by_group(scores, n_groups, group_size), index=columns)
            for scores in scores_per_clf]
    return pd.DataFrame(rows, index=[KNN_TAG, RF_TAG, SVM_TAG])


def get_score_results_df(normal_file_path, unbiased_file_path, noise,
                         runs_per_config=100, imfs=(4, 8, 16)):
    """Load the NORMAL and UNBIASED result files and average the scores per IMF setting.

    Each file is parsed as XML and the per-classifier score lists are read
    with ``get_clfs_scores``. Every list is cut into consecutive groups of
    ``runs_per_config`` scores; group ``i`` is averaged and labelled with
    ``imfs[i]``, i.e. the scores are assumed to be stored in ``imfs`` order.

    Parameters
    ----------
    normal_file_path, unbiased_file_path : str
        Paths of the two XML result files.
    noise : object
        Value inserted into the column labels through ``str.format``.
    runs_per_config : int, optional
        Number of consecutive scores averaged into one column (default 100,
        the value previously hard-coded).
    imfs : sequence, optional
        IMF counts used to label the columns, in file order (default
        ``(4, 8, 16)``, the labels previously hard-coded).

    Returns
    -------
    (normal_score_df, unbiased_score_df) : tuple of pandas.DataFrame
        Rows are indexed by KNN_TAG, RF_TAG, SVM_TAG; columns are
        ``'<Normal|Unbiased>_Acc_Noise_<noise>_Imfs_<imf>'``.

    Notes
    -----
    The number of groups is derived from the KNN list of the NORMAL file only
    (``len // runs_per_config``); scores after the last full group are
    ignored. A shorter list for any other classifier or for the UNBIASED file
    is therefore averaged over truncated or empty slices instead of raising.
    The Series construction needs exactly ``len(imfs)`` means per classifier,
    so a file with a different number of full groups is expected to be
    rejected by pandas.
    """
    normal_file = ET.parse(normal_file_path)
    unbiased_file = ET.parse(unbiased_file_path)

    # Each is a (knn, rf, svm) tuple of score lists.
    normal_scores = get_clfs_scores(normal_file)
    unbiased_scores = get_clfs_scores(unbiased_file)

    # Same group count for all six lists, taken from the NORMAL KNN scores.
    n_groups = len(normal_scores[0]) // runs_per_config

    # Column labels, one per IMF setting.
    normal_idx = ['Normal_Acc_Noise_{0}_Imfs_{1}'.format(noise, imf) for imf in imfs]
    unbiased_idx = ['Unbiased_Acc_Noise_{0}_Imfs_{1}'.format(noise, imf) for imf in imfs]

    normal_score_df = _build_score_df(normal_scores, n_groups, runs_per_config, normal_idx)
    unbiased_score_df = _build_score_df(unbiased_scores, n_groups, runs_per_config, unbiased_idx)

    return normal_score_df, unbiased_score_df

def _best_per_classifier(score_df, label_column):
    """Return, for each classifier row, the best-scoring column label and its rounded score."""
    rows = []
    for clf in (KNN_TAG, RF_TAG, SVM_TAG):
        clf_scores = score_df.loc[clf]
        # idxmax() returns the column *label* of the maximum. np.argmax() on a
        # Series returns an integer position on current pandas, which would put
        # 0/1/2 in the table instead of the configuration name.
        best_label = clf_scores.idxmax()
        best_score = round(float(clf_scores.max()), 6)
        rows.append(pd.Series({label_column: best_label, 'Score': best_score}))
    return pd.DataFrame(rows, index=[KNN_TAG, RF_TAG, SVM_TAG])


def get_best_config_df(normal_score_df, unbiased_score_df):
    """Summarise the best configuration per classifier for both experiments.

    Parameters
    ----------
    normal_score_df, unbiased_score_df : pandas.DataFrame
        Score tables with one row per classifier (KNN_TAG, RF_TAG, SVM_TAG)
        and one column per configuration.

    Returns
    -------
    pandas.DataFrame
        Indexed by classifier, with columns ``Best_Normal``, ``Score_x``,
        ``Best_Unbiased`` and ``Score_y``. The ``_x``/``_y`` suffixes are
        added by ``pd.merge`` because both sides carry a ``Score`` column.
        Scores are rounded to 6 decimals.

    Raises
    ------
    KeyError
        If one of the three classifier rows is missing from a score table.
    """
    best_normal_df = _best_per_classifier(normal_score_df, 'Best_Normal')
    best_unbiased_df = _best_per_classifier(unbiased_score_df, 'Best_Unbiased')

    return pd.merge(best_normal_df, best_unbiased_df, how='inner', left_index=True, right_index=True)

def get_all_df(noise_folder,normal_file_name, unbiased_file_name,noise_strength):
    """Build the score tables and the best-configuration table for one noise level.

    The file paths are formed by concatenating PATH_STEP1, ``noise_folder``
    and the file name, so ``noise_folder`` must carry its own trailing
    separator. ``noise_strength`` is forwarded to ``get_score_results_df``.

    Returns
    -------
    (normal_score_df, unbiased_score_df, best_config_df)
    """
    folder_path = PATH_STEP1 + noise_folder
    normal_df, unbiased_df = get_score_results_df(
        folder_path + normal_file_name,
        folder_path + unbiased_file_name,
        noise_strength,
    )
    return normal_df, unbiased_df, get_best_config_df(normal_df, unbiased_df)

def get_final_df(best_dfs):
    """Pick, per classifier, the overall best configuration across several tables.

    Parameters
    ----------
    best_dfs : iterable of pandas.DataFrame
        Tables with rows KNN_TAG, RF_TAG, SVM_TAG and columns ``Best_Normal``,
        ``Score_x``, ``Best_Unbiased``, ``Score_y`` (one table per noise level
        in this notebook).

    Returns
    -------
    pandas.DataFrame
        Same layout as the inputs: for every classifier, the entry with the
        highest ``Score_x`` and the entry with the highest ``Score_y``. When
        scores tie, the earliest table wins.

    Raises
    ------
    ValueError
        If ``best_dfs`` is empty (``max`` of an empty sequence).
    """
    tables = list(best_dfs)
    classifiers = [KNN_TAG, RF_TAG, SVM_TAG]

    def overall_best(label_column, score_column):
        # One (label, score) candidate per input table; keep the top-scoring one.
        winners = []
        for clf in classifiers:
            candidates = [(table.loc[clf][label_column], table.loc[clf][score_column])
                          for table in tables]
            label, score = max(candidates, key=lambda pair: pair[1])
            winners.append(pd.Series({label_column: label, 'Score': score}))
        return pd.DataFrame(winners, index=classifiers)

    return pd.merge(overall_best('Best_Normal', 'Score_x'),
                    overall_best('Best_Unbiased', 'Score_y'),
                    how='inner', left_index=True, right_index=True)

In [3]:
## NOISE = 0.01 --> best EEMD number of IMFs for the NORMAL and UNBIASED experiments (ACCURACY score)
NOISE_001_FOLDER = 'noise_001/'
NORMAL_ACC_RESULTS_FILE = 'step1_eemd001_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_eemd001_unbiased_acc_results.xml'

# Two per-classifier score tables plus the best-configuration summary.
normal_001_score_df, unbiased_001_score_df, best_001_config_df = get_all_df(
    NOISE_001_FOLDER,
    NORMAL_ACC_RESULTS_FILE,
    UNBIASED_ACC_RESULTS_FILE,
    0.01,
)

In [4]:
normal_001_score_df

Unnamed: 0,Normal_Acc_Noise_0.01_Imfs_4,Normal_Acc_Noise_0.01_Imfs_8,Normal_Acc_Noise_0.01_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.979815,0.979718,0.97925
RandomForestClassifier(random_state=1010),0.989204,0.991373,0.98841
SVC(random_state=1010),0.232768,0.221836,0.191127


In [5]:
unbiased_001_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.01_Imfs_4,Unbiased_Acc_Noise_0.01_Imfs_8,Unbiased_Acc_Noise_0.01_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.761297,0.761607,0.755108
RandomForestClassifier(random_state=1010),0.784295,0.831716,0.820113
SVC(random_state=1010),0.136402,0.131565,0.088962


In [6]:
best_001_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.01_Imfs_4,0.979815,Unbiased_Acc_Noise_0.01_Imfs_8,0.761607
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.01_Imfs_8,0.991373,Unbiased_Acc_Noise_0.01_Imfs_8,0.831716
SVC(random_state=1010),Normal_Acc_Noise_0.01_Imfs_4,0.232768,Unbiased_Acc_Noise_0.01_Imfs_4,0.136402


In [7]:
## NOISE = 0.05 --> best EEMD number of IMFs for the NORMAL and UNBIASED experiments (ACCURACY score)
NOISE_005_FOLDER = 'noise_005/'
NORMAL_ACC_RESULTS_FILE = 'step1_eemd005_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_eemd005_unbiased_acc_results.xml'

# Two per-classifier score tables plus the best-configuration summary.
normal_005_score_df, unbiased_005_score_df, best_005_config_df = get_all_df(
    NOISE_005_FOLDER,
    NORMAL_ACC_RESULTS_FILE,
    UNBIASED_ACC_RESULTS_FILE,
    0.05,
)

In [8]:
normal_005_score_df

Unnamed: 0,Normal_Acc_Noise_0.05_Imfs_4,Normal_Acc_Noise_0.05_Imfs_8,Normal_Acc_Noise_0.05_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.979353,0.979473,0.979473
RandomForestClassifier(random_state=1010),0.991706,0.993103,0.991152
SVC(random_state=1010),0.233107,0.221711,0.19054


In [9]:
unbiased_005_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.05_Imfs_4,Unbiased_Acc_Noise_0.05_Imfs_8,Unbiased_Acc_Noise_0.05_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.758217,0.759261,0.75248
RandomForestClassifier(random_state=1010),0.795877,0.844635,0.825324
SVC(random_state=1010),0.136433,0.131862,0.088884


In [10]:
best_005_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.05_Imfs_8,0.979473,Unbiased_Acc_Noise_0.05_Imfs_8,0.759261
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.05_Imfs_8,0.993103,Unbiased_Acc_Noise_0.05_Imfs_8,0.844635
SVC(random_state=1010),Normal_Acc_Noise_0.05_Imfs_4,0.233107,Unbiased_Acc_Noise_0.05_Imfs_4,0.136433


In [11]:
## NOISE = 0.1 --> Analyzing the best EEMD number of IMFs for both NORMAL and UNBIASED experiments with ACCURACY score
NOISE_01_FOLDER = 'noise_01/'
NORMAL_ACC_RESULTS_FILE = 'step1_eemd01_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_eemd01_unbiased_acc_results.xml'

# Two per-classifier score tables plus the best-configuration summary (noise strength 0.1).
normal_01_score_df, unbiased_01_score_df, best_01_config_df = get_all_df(
    NOISE_01_FOLDER,
    NORMAL_ACC_RESULTS_FILE,
    UNBIASED_ACC_RESULTS_FILE,
    0.1,
)

In [12]:
normal_01_score_df

Unnamed: 0,Normal_Acc_Noise_0.1_Imfs_4,Normal_Acc_Noise_0.1_Imfs_8,Normal_Acc_Noise_0.1_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.979294,0.97935,0.979601
RandomForestClassifier(random_state=1010),0.992534,0.994349,0.992433
SVC(random_state=1010),0.233547,0.222458,0.18995


In [13]:
unbiased_01_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.1_Imfs_4,Unbiased_Acc_Noise_0.1_Imfs_8,Unbiased_Acc_Noise_0.1_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.757737,0.759335,0.751988
RandomForestClassifier(random_state=1010),0.794527,0.85799,0.844676
SVC(random_state=1010),0.137133,0.132256,0.088976


In [14]:
best_01_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.1_Imfs_16,0.979601,Unbiased_Acc_Noise_0.1_Imfs_8,0.759335
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.1_Imfs_8,0.994349,Unbiased_Acc_Noise_0.1_Imfs_8,0.85799
SVC(random_state=1010),Normal_Acc_Noise_0.1_Imfs_4,0.233547,Unbiased_Acc_Noise_0.1_Imfs_4,0.137133


In [15]:
## NOISE = 0.2 --> best EEMD number of IMFs for the NORMAL and UNBIASED experiments (ACCURACY score)
NOISE_02_FOLDER = 'noise_02/'
NORMAL_ACC_RESULTS_FILE = 'step1_eemd02_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_eemd02_unbiased_acc_results.xml'

# Two per-classifier score tables plus the best-configuration summary.
normal_02_score_df, unbiased_02_score_df, best_02_config_df = get_all_df(
    NOISE_02_FOLDER,
    NORMAL_ACC_RESULTS_FILE,
    UNBIASED_ACC_RESULTS_FILE,
    0.2,
)

In [16]:
normal_02_score_df

Unnamed: 0,Normal_Acc_Noise_0.2_Imfs_4,Normal_Acc_Noise_0.2_Imfs_8,Normal_Acc_Noise_0.2_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.978326,0.977981,0.977981
RandomForestClassifier(random_state=1010),0.993293,0.99535,0.994403
SVC(random_state=1010),0.234665,0.224033,0.189141


In [17]:
unbiased_02_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.2_Imfs_4,Unbiased_Acc_Noise_0.2_Imfs_8,Unbiased_Acc_Noise_0.2_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.757561,0.759062,0.7518
RandomForestClassifier(random_state=1010),0.803953,0.868365,0.860771
SVC(random_state=1010),0.139183,0.133249,0.089203


In [18]:
best_02_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.2_Imfs_4,0.978326,Unbiased_Acc_Noise_0.2_Imfs_8,0.759062
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.2_Imfs_8,0.99535,Unbiased_Acc_Noise_0.2_Imfs_8,0.868365
SVC(random_state=1010),Normal_Acc_Noise_0.2_Imfs_4,0.234665,Unbiased_Acc_Noise_0.2_Imfs_4,0.139183


In [19]:
## NOISE = 0.3 --> best EEMD number of IMFs for the NORMAL and UNBIASED experiments (ACCURACY score)
NOISE_03_FOLDER = 'noise_03/'
NORMAL_ACC_RESULTS_FILE = 'step1_eemd03_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_eemd03_unbiased_acc_results.xml'

# Two per-classifier score tables plus the best-configuration summary.
normal_03_score_df, unbiased_03_score_df, best_03_config_df = get_all_df(
    NOISE_03_FOLDER,
    NORMAL_ACC_RESULTS_FILE,
    UNBIASED_ACC_RESULTS_FILE,
    0.3,
)

In [20]:
normal_03_score_df

Unnamed: 0,Normal_Acc_Noise_0.3_Imfs_4,Normal_Acc_Noise_0.3_Imfs_8,Normal_Acc_Noise_0.3_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.977142,0.975405,0.975965
RandomForestClassifier(random_state=1010),0.99384,0.9958,0.994269
SVC(random_state=1010),0.236029,0.225089,0.188365


In [21]:
unbiased_03_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.3_Imfs_4,Unbiased_Acc_Noise_0.3_Imfs_8,Unbiased_Acc_Noise_0.3_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.760969,0.760026,0.752708
RandomForestClassifier(random_state=1010),0.792924,0.874571,0.861079
SVC(random_state=1010),0.140404,0.13355,0.088889


In [22]:
best_03_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.3_Imfs_4,0.977142,Unbiased_Acc_Noise_0.3_Imfs_4,0.760969
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.3_Imfs_8,0.9958,Unbiased_Acc_Noise_0.3_Imfs_8,0.874571
SVC(random_state=1010),Normal_Acc_Noise_0.3_Imfs_4,0.236029,Unbiased_Acc_Noise_0.3_Imfs_4,0.140404


In [23]:
## NOISE = 0.4 --> best EEMD number of IMFs for the NORMAL and UNBIASED experiments (ACCURACY score)
NOISE_04_FOLDER = 'noise_04/'
NORMAL_ACC_RESULTS_FILE = 'step1_eemd04_normal_acc_results.xml'
UNBIASED_ACC_RESULTS_FILE = 'step1_eemd04_unbiased_acc_results.xml'

# Two per-classifier score tables plus the best-configuration summary.
normal_04_score_df, unbiased_04_score_df, best_04_config_df = get_all_df(
    NOISE_04_FOLDER,
    NORMAL_ACC_RESULTS_FILE,
    UNBIASED_ACC_RESULTS_FILE,
    0.4,
)

In [24]:
normal_04_score_df

Unnamed: 0,Normal_Acc_Noise_0.4_Imfs_4,Normal_Acc_Noise_0.4_Imfs_8,Normal_Acc_Noise_0.4_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.975689,0.975104,0.975104
RandomForestClassifier(random_state=1010),0.993373,0.994919,0.994668
SVC(random_state=1010),0.236986,0.226456,0.186875


In [25]:
unbiased_04_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.4_Imfs_4,Unbiased_Acc_Noise_0.4_Imfs_8,Unbiased_Acc_Noise_0.4_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.752857,0.753615,0.747293
RandomForestClassifier(random_state=1010),0.794824,0.869414,0.853336
SVC(random_state=1010),0.141623,0.134418,0.088827


In [26]:
best_04_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.4_Imfs_4,0.975689,Unbiased_Acc_Noise_0.4_Imfs_8,0.753615
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.4_Imfs_8,0.994919,Unbiased_Acc_Noise_0.4_Imfs_8,0.869414
SVC(random_state=1010),Normal_Acc_Noise_0.4_Imfs_4,0.236986,Unbiased_Acc_Noise_0.4_Imfs_4,0.141623


In [27]:
# Best-configuration tables of every noise level, in increasing noise order.
best_configs = [
    best_001_config_df,
    best_005_config_df,
    best_01_config_df,
    best_02_config_df,
    best_03_config_df,
    best_04_config_df,
]
# Last expression of the cell: the overall best configuration per classifier is displayed.
get_final_df(best_configs)

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.01_Imfs_4,0.979815,Unbiased_Acc_Noise_0.01_Imfs_8,0.761607
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.3_Imfs_8,0.9958,Unbiased_Acc_Noise_0.3_Imfs_8,0.874571
SVC(random_state=1010),Normal_Acc_Noise_0.4_Imfs_4,0.236986,Unbiased_Acc_Noise_0.4_Imfs_4,0.141623
