In [1]:
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

### GLOBAL CONSTANTS
# Root folder of the step-1 EEMD result files; get_all_df builds every file
# path as PATH_STEP1 + <noise folder> + <file name>.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PATH_STEP1 = '/home/jaum/PG/pg/Final_Output/step1/eemd/'

# XML names used by get_clfs_scores: 'classifier' elements are selected by
# their 'config' attribute and their 'score' attribute is read.
CONFIG_TAG = 'config'
SCORE_TAG = 'score'
CLF_TAG = 'classifier'

# Values of the 'config' attribute that identify each classifier in the XML
# results; they are also used as the row labels of the result DataFrames.
KNN_TAG = 'KNeighborsClassifier(n_neighbors=1)'
RF_TAG = 'RandomForestClassifier(random_state=1010)'
SVM_TAG = 'SVC(random_state=1010)'

In [2]:
### METHODS NEEDED

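# Typical arguments (as passed by get_clfs_scores):
#   tag      = CLF_TAG ('classifier')
#   key_attr = (CONFIG_TAG, <classifier config string>), i.e. (attribute name, required value)
#   ret_attr = SCORE_TAG ('score')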
def parse_unique_tag(file_in,tag,key_attr,ret_attr):
    """Collect a numeric attribute from every matching element of an XML tree.

    Parameters
    ----------
    file_in : object exposing ``iter(tag)``
        For example an ``ElementTree`` or an ``Element``.
    tag : str
        Name of the elements to visit.
    key_attr : sequence
        ``key_attr[0]`` is the name of the attribute to test and
        ``key_attr[1]`` the value it must be equal to.
    ret_attr : str
        Name of the attribute whose value is converted to ``float`` and
        returned for every matching element.

    Returns
    -------
    list of float
        One value per matching element, in iteration order.

    Raises
    ------
    KeyError
        If a matching element has no ``ret_attr`` attribute.
    ValueError
        If a ``ret_attr`` value cannot be converted to ``float``.
    """
    key_name, key_value = key_attr[0], key_attr[1]
    # Elements that do not carry the key attribute at all cannot match, so
    # they are skipped instead of aborting the whole scan with a KeyError.
    return [float(elem.attrib[ret_attr])
            for elem in file_in.iter(tag)
            if key_name in elem.attrib and elem.attrib[key_name] == key_value]

def get_clfs_scores(file_in):
    """Return the score lists of the three classifiers stored in ``file_in``.

    For each of KNN_TAG, RF_TAG and SVM_TAG (in that order) the CLF_TAG
    elements whose CONFIG_TAG attribute equals the classifier tag are looked
    up with ``parse_unique_tag`` and their SCORE_TAG values collected.

    Returns a 3-tuple ``(knn_scores, rf_scores, svm_scores)`` of float lists.
    """
    scores_per_clf = [
        parse_unique_tag(file_in, CLF_TAG, (CONFIG_TAG, clf_config), SCORE_TAG)
        for clf_config in (KNN_TAG, RF_TAG, SVM_TAG)
    ]
    return tuple(scores_per_clf)

def _mean_by_chunks(scores, n_chunks, chunk_size):
    """Return the mean of each of the first ``n_chunks`` consecutive ``chunk_size``-long slices of ``scores``."""
    return [np.mean(scores[start:start + chunk_size])
            for start in range(0, n_chunks * chunk_size, chunk_size)]


def _scores_to_df(scores_per_clf, n_chunks, chunk_size, columns):
    """Build a frame with one row per classifier (KNN, RF, SVM) and one chunk-averaged score per column label."""
    rows = [pd.Series(_mean_by_chunks(scores, n_chunks, chunk_size), index=columns)
            for scores in scores_per_clf]
    return pd.DataFrame(rows, index=[KNN_TAG, RF_TAG, SVM_TAG])


def get_score_results_df(normal_file_path, unbiased_file_path, noise,
                         imfs_counts=(4, 8, 16), runs_per_config=100):
    """Average the per-run scores of both experiments into two DataFrames.

    Both XML files are parsed with ``ET.parse`` and the score lists of the
    three classifiers are extracted with ``get_clfs_scores``. Each list is
    cut into consecutive chunks of ``runs_per_config`` scores and every chunk
    is replaced by its mean.
    NOTE(review): presumably each chunk holds the runs of one IMF
    configuration, in the order given by ``imfs_counts`` -- confirm against
    the code that writes the XML files.

    Parameters
    ----------
    normal_file_path, unbiased_file_path : str
        Paths of the XML result files of the normal / unbiased experiment.
    noise : object
        Value formatted into the column labels (e.g. ``0.01``).
    imfs_counts : sequence, optional
        IMF counts used to build the column labels; default ``(4, 8, 16)``.
    runs_per_config : int, optional
        Number of consecutive scores averaged into one column; default 100.

    Returns
    -------
    (normal_score_df, unbiased_score_df) : tuple of pandas.DataFrame
        Rows are labelled KNN_TAG, RF_TAG, SVM_TAG; columns are
        ``'Normal_F1_Noise_<noise>_Imfs_<n>'`` and
        ``'Unbiased_F1_Noise_<noise>_Imfs_<n>'`` respectively.

    Raises
    ------
    ValueError
        If ``runs_per_config`` is smaller than 1, or if the number of chunks
        does not match the number of column labels (raised by pandas).
    """
    if runs_per_config < 1:
        raise ValueError('runs_per_config must be a positive integer')

    normal_scores = get_clfs_scores(ET.parse(normal_file_path))
    unbiased_scores = get_clfs_scores(ET.parse(unbiased_file_path))

    # The number of complete chunks is derived from the KNN scores of the
    # normal experiment only; every other list is sliced with the same bounds.
    n_chunks = len(normal_scores[0]) // runs_per_config

    # Setting up the DataFrames
    normal_idx = ['Normal_F1_Noise_{0}_Imfs_{1}'.format(noise, n_imfs)
                  for n_imfs in imfs_counts]
    unbiased_idx = ['Unbiased_F1_Noise_{0}_Imfs_{1}'.format(noise, n_imfs)
                    for n_imfs in imfs_counts]

    normal_score_df = _scores_to_df(normal_scores, n_chunks, runs_per_config, normal_idx)
    unbiased_score_df = _scores_to_df(unbiased_scores, n_chunks, runs_per_config, unbiased_idx)

    return normal_score_df, unbiased_score_df

def _best_config_rows(score_df, label_column):
    """For each classifier row of ``score_df`` return the label of its best column and that score rounded to 6 decimals."""
    rows = []
    for clf_tag in (KNN_TAG, RF_TAG, SVM_TAG):
        clf_scores = score_df.loc[clf_tag]
        # idxmax() returns the column *label* of the maximum. np.argmax on a
        # Series returns an integer position on current pandas versions, which
        # is not what the 'Best_*' columns are meant to hold.
        rows.append(pd.Series({label_column: clf_scores.idxmax(),
                               'Score': round(clf_scores.max(), 6)}))
    return pd.DataFrame(rows, index=[KNN_TAG, RF_TAG, SVM_TAG])


def get_best_config_df(normal_score_df, unbiased_score_df):
    """Find, per classifier, the best-scoring configuration of each experiment.

    Parameters
    ----------
    normal_score_df, unbiased_score_df : pandas.DataFrame
        Frames with rows KNN_TAG, RF_TAG and SVM_TAG and one column per
        configuration, as returned by ``get_score_results_df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by KNN_TAG, RF_TAG, SVM_TAG with columns ``Best_Normal``,
        ``Score_x``, ``Best_Unbiased`` and ``Score_y``. The two ``Score``
        columns get the ``_x`` / ``_y`` suffixes from ``pd.merge``.
    """
    best_normal_df = _best_config_rows(normal_score_df, 'Best_Normal')
    best_unbiased_df = _best_config_rows(unbiased_score_df, 'Best_Unbiased')

    return pd.merge(best_normal_df, best_unbiased_df, how='inner', left_index=True, right_index=True)

def get_all_df(noise_folder,normal_file_name, unbiased_file_name,noise_strength):
    """Load one noise level and return its three result frames.

    The file paths are built by plain string concatenation:
    ``PATH_STEP1 + noise_folder + <file name>``.

    Returns ``(normal_score_df, unbiased_score_df, best_config_df)``, where the
    first two come from ``get_score_results_df`` and the last one from
    ``get_best_config_df`` applied to them.
    """
    folder_path = PATH_STEP1 + noise_folder
    score_dfs = get_score_results_df(folder_path + normal_file_name,
                                     folder_path + unbiased_file_name,
                                     noise_strength)
    normal_score_df, unbiased_score_df = score_dfs

    return (normal_score_df,
            unbiased_score_df,
            get_best_config_df(normal_score_df, unbiased_score_df))

# best_dfs: the per-noise-level frames returned by get_best_config_df,
# e.g. [best_001_config_df, best_005_config_df, ..., best_04_config_df]
def get_final_df(best_dfs):
    """Pick, per classifier, the overall best configuration among ``best_dfs``.

    Every frame in ``best_dfs`` must have rows KNN_TAG, RF_TAG, SVM_TAG and
    columns ``Best_Normal``, ``Score_x``, ``Best_Unbiased``, ``Score_y``.
    For each classifier the (configuration, score) pair with the highest
    score is kept, separately for the normal and the unbiased columns; on
    ties the first frame wins.

    Returns a DataFrame indexed by the classifier tags with columns
    ``Best_Normal``, ``Score_x``, ``Best_Unbiased`` and ``Score_y``.
    """
    frames = list(best_dfs)
    clf_tags = [KNN_TAG, RF_TAG, SVM_TAG]

    def pick_best(label_col, score_col):
        # One row per classifier: the candidate with the largest score.
        rows = []
        for clf_tag in clf_tags:
            candidates = []
            for frame in frames:
                clf_row = frame.loc[clf_tag]
                candidates.append((clf_row[label_col], clf_row[score_col]))
            winner = max(candidates, key=lambda pair: pair[1])
            rows.append(pd.Series({label_col: winner[0], 'Score': winner[1]}))
        return pd.DataFrame(rows, index=clf_tags)

    best_normal_config_df = pick_best('Best_Normal', 'Score_x')
    best_unbiased_config_df = pick_best('Best_Unbiased', 'Score_y')

    return pd.merge(best_normal_config_df, best_unbiased_config_df,
                    how='inner', left_index=True, right_index=True)

In [3]:
## NOISE = 0.01 --> Analyzing the best EEMD number of IMFs for both NORMAL and UNBIASED experiments with F1_MACRO score
# get_all_df resolves the files as PATH_STEP1 + folder + file name.
# NOTE: NORMAL_F1_RESULTS_FILE / UNBIASED_F1_RESULTS_FILE are reassigned by every
# noise-level cell, so their value depends on which of those cells ran last.
NOISE_001_FOLDER = 'noise_001/'
NORMAL_F1_RESULTS_FILE = 'step1_eemd001_normal_f1_results.xml'
UNBIASED_F1_RESULTS_FILE = 'step1_eemd001_unbiased_f1_results.xml'

# Returns (normal scores, unbiased scores, best configuration per classifier) for noise 0.01.
normal_001_score_df, unbiased_001_score_df, best_001_config_df = get_all_df(NOISE_001_FOLDER, NORMAL_F1_RESULTS_FILE,
                                                                            UNBIASED_F1_RESULTS_FILE, 0.01)

In [4]:
# Mean score per classifier (rows) and IMF count (columns), normal experiment, noise 0.01.
# NOTE(review): the saved output labels its columns 'Normal_Acc_...' while
# get_score_results_df builds 'Normal_F1_...' labels -- the stored outputs appear to
# predate the current code; re-run the notebook to refresh them.
normal_001_score_df

Unnamed: 0,Normal_Acc_Noise_0.01_Imfs_4,Normal_Acc_Noise_0.01_Imfs_8,Normal_Acc_Noise_0.01_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.972506,0.972499,0.971792
RandomForestClassifier(random_state=1010),0.985125,0.988757,0.984828
SVC(random_state=1010),0.089938,0.084745,0.069762


In [5]:
# Mean score per classifier (rows) and IMF count (columns), unbiased experiment, noise 0.01.
unbiased_001_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.01_Imfs_4,Unbiased_Acc_Noise_0.01_Imfs_8,Unbiased_Acc_Noise_0.01_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.481342,0.481555,0.486343
RandomForestClassifier(random_state=1010),0.509389,0.566877,0.52027
SVC(random_state=1010),0.09715,0.090845,0.057557


In [6]:
# Best-scoring configuration per classifier at noise 0.01
# (normal: Best_Normal / Score_x, unbiased: Best_Unbiased / Score_y).
best_001_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.01_Imfs_4,0.972506,Unbiased_Acc_Noise_0.01_Imfs_16,0.486343
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.01_Imfs_8,0.988757,Unbiased_Acc_Noise_0.01_Imfs_8,0.566877
SVC(random_state=1010),Normal_Acc_Noise_0.01_Imfs_4,0.089938,Unbiased_Acc_Noise_0.01_Imfs_4,0.09715


In [7]:
## NOISE = 0.05 --> Analyzing the best EEMD number of IMFs for both NORMAL and UNBIASED experiments with F1_MACRO score
# get_all_df resolves the files as PATH_STEP1 + folder + file name.
# NOTE: NORMAL_F1_RESULTS_FILE / UNBIASED_F1_RESULTS_FILE are reassigned by every
# noise-level cell, so their value depends on which of those cells ran last.
NOISE_005_FOLDER = 'noise_005/'
NORMAL_F1_RESULTS_FILE = 'step1_eemd005_normal_f1_results.xml'
UNBIASED_F1_RESULTS_FILE = 'step1_eemd005_unbiased_f1_results.xml'

# Returns (normal scores, unbiased scores, best configuration per classifier) for noise 0.05.
normal_005_score_df, unbiased_005_score_df, best_005_config_df = get_all_df(NOISE_005_FOLDER, NORMAL_F1_RESULTS_FILE,
                                                                            UNBIASED_F1_RESULTS_FILE, 0.05)

In [8]:
# Mean score per classifier (rows) and IMF count (columns), normal experiment, noise 0.05.
normal_005_score_df

Unnamed: 0,Normal_Acc_Noise_0.05_Imfs_4,Normal_Acc_Noise_0.05_Imfs_8,Normal_Acc_Noise_0.05_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.971892,0.97221,0.97221
RandomForestClassifier(random_state=1010),0.988529,0.990739,0.98839
SVC(random_state=1010),0.090133,0.084779,0.069361


In [9]:
# Mean score per classifier (rows) and IMF count (columns), unbiased experiment, noise 0.05.
unbiased_005_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.05_Imfs_4,Unbiased_Acc_Noise_0.05_Imfs_8,Unbiased_Acc_Noise_0.05_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.481007,0.481667,0.482236
RandomForestClassifier(random_state=1010),0.524898,0.576789,0.537041
SVC(random_state=1010),0.097124,0.091024,0.057307


In [10]:
# Best-scoring configuration per classifier at noise 0.05
# (normal: Best_Normal / Score_x, unbiased: Best_Unbiased / Score_y).
best_005_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.05_Imfs_8,0.97221,Unbiased_Acc_Noise_0.05_Imfs_16,0.482236
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.05_Imfs_8,0.990739,Unbiased_Acc_Noise_0.05_Imfs_8,0.576789
SVC(random_state=1010),Normal_Acc_Noise_0.05_Imfs_4,0.090133,Unbiased_Acc_Noise_0.05_Imfs_4,0.097124


In [11]:
## NOISE = 0.1 --> Analyzing the best EEMD number of IMFs for both NORMAL and UNBIASED experiments with F1_MACRO score
# get_all_df resolves the files as PATH_STEP1 + folder + file name.
# NOTE: NORMAL_F1_RESULTS_FILE / UNBIASED_F1_RESULTS_FILE are reassigned by every
# noise-level cell, so their value depends on which of those cells ran last.
NOISE_01_FOLDER = 'noise_01/'
NORMAL_F1_RESULTS_FILE = 'step1_eemd01_normal_f1_results.xml'
UNBIASED_F1_RESULTS_FILE = 'step1_eemd01_unbiased_f1_results.xml'

# Returns (normal scores, unbiased scores, best configuration per classifier) for noise 0.1.
normal_01_score_df, unbiased_01_score_df, best_01_config_df = get_all_df(NOISE_01_FOLDER, NORMAL_F1_RESULTS_FILE,
                                                                            UNBIASED_F1_RESULTS_FILE, 0.1)

In [12]:
# Mean score per classifier (rows) and IMF count (columns), normal experiment, noise 0.1.
normal_01_score_df

Unnamed: 0,Normal_Acc_Noise_0.1_Imfs_4,Normal_Acc_Noise_0.1_Imfs_8,Normal_Acc_Noise_0.1_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.971684,0.97189,0.972261
RandomForestClassifier(random_state=1010),0.989544,0.99263,0.989953
SVC(random_state=1010),0.090415,0.085314,0.068917


In [13]:
# Mean score per classifier (rows) and IMF count (columns), unbiased experiment, noise 0.1.
unbiased_01_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.1_Imfs_4,Unbiased_Acc_Noise_0.1_Imfs_8,Unbiased_Acc_Noise_0.1_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.483648,0.489975,0.484465
RandomForestClassifier(random_state=1010),0.515555,0.611008,0.5679
SVC(random_state=1010),0.097593,0.091635,0.057325


In [14]:
# Best-scoring configuration per classifier at noise 0.1
# (normal: Best_Normal / Score_x, unbiased: Best_Unbiased / Score_y).
best_01_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.1_Imfs_16,0.972261,Unbiased_Acc_Noise_0.1_Imfs_8,0.489975
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.1_Imfs_8,0.99263,Unbiased_Acc_Noise_0.1_Imfs_8,0.611008
SVC(random_state=1010),Normal_Acc_Noise_0.1_Imfs_4,0.090415,Unbiased_Acc_Noise_0.1_Imfs_4,0.097593


In [15]:
## NOISE = 0.2 --> Analyzing the best EEMD number of IMFs for both NORMAL and UNBIASED experiments with F1_MACRO score
# get_all_df resolves the files as PATH_STEP1 + folder + file name.
# NOTE: NORMAL_F1_RESULTS_FILE / UNBIASED_F1_RESULTS_FILE are reassigned by every
# noise-level cell, so their value depends on which of those cells ran last.
NOISE_02_FOLDER = 'noise_02/'
NORMAL_F1_RESULTS_FILE = 'step1_eemd02_normal_f1_results.xml'
UNBIASED_F1_RESULTS_FILE = 'step1_eemd02_unbiased_f1_results.xml'

# Returns (normal scores, unbiased scores, best configuration per classifier) for noise 0.2.
normal_02_score_df, unbiased_02_score_df, best_02_config_df = get_all_df(NOISE_02_FOLDER, NORMAL_F1_RESULTS_FILE,
                                                                            UNBIASED_F1_RESULTS_FILE, 0.2)

In [16]:
# Mean score per classifier (rows) and IMF count (columns), normal experiment, noise 0.2.
normal_02_score_df

Unnamed: 0,Normal_Acc_Noise_0.2_Imfs_4,Normal_Acc_Noise_0.2_Imfs_8,Normal_Acc_Noise_0.2_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.970456,0.970175,0.970175
RandomForestClassifier(random_state=1010),0.990705,0.994134,0.992721
SVC(random_state=1010),0.090894,0.086313,0.068395


In [17]:
# Mean score per classifier (rows) and IMF count (columns), unbiased experiment, noise 0.2.
unbiased_02_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.2_Imfs_4,Unbiased_Acc_Noise_0.2_Imfs_8,Unbiased_Acc_Noise_0.2_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.480173,0.482934,0.480261
RandomForestClassifier(random_state=1010),0.534227,0.636837,0.596439
SVC(random_state=1010),0.099432,0.093027,0.057261


In [18]:
# Best-scoring configuration per classifier at noise 0.2
# (normal: Best_Normal / Score_x, unbiased: Best_Unbiased / Score_y).
best_02_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.2_Imfs_4,0.970456,Unbiased_Acc_Noise_0.2_Imfs_8,0.482934
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.2_Imfs_8,0.994134,Unbiased_Acc_Noise_0.2_Imfs_8,0.636837
SVC(random_state=1010),Normal_Acc_Noise_0.2_Imfs_4,0.090894,Unbiased_Acc_Noise_0.2_Imfs_4,0.099432


In [19]:
## NOISE = 0.3 --> Analyzing the best EEMD number of IMFs for both NORMAL and UNBIASED experiments with F1_MACRO score
# get_all_df resolves the files as PATH_STEP1 + folder + file name.
# NOTE: NORMAL_F1_RESULTS_FILE / UNBIASED_F1_RESULTS_FILE are reassigned by every
# noise-level cell, so their value depends on which of those cells ran last.
NOISE_03_FOLDER = 'noise_03/'
NORMAL_F1_RESULTS_FILE = 'step1_eemd03_normal_f1_results.xml'
UNBIASED_F1_RESULTS_FILE = 'step1_eemd03_unbiased_f1_results.xml'

# Returns (normal scores, unbiased scores, best configuration per classifier) for noise 0.3.
normal_03_score_df, unbiased_03_score_df, best_03_config_df = get_all_df(NOISE_03_FOLDER, NORMAL_F1_RESULTS_FILE,
                                                                            UNBIASED_F1_RESULTS_FILE, 0.3)

In [20]:
# Mean score per classifier (rows) and IMF count (columns), normal experiment, noise 0.3.
normal_03_score_df

Unnamed: 0,Normal_Acc_Noise_0.3_Imfs_4,Normal_Acc_Noise_0.3_Imfs_8,Normal_Acc_Noise_0.3_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.968895,0.966776,0.96773
RandomForestClassifier(random_state=1010),0.991455,0.994675,0.992547
SVC(random_state=1010),0.091477,0.086894,0.067901


In [21]:
# Mean score per classifier (rows) and IMF count (columns), unbiased experiment, noise 0.3.
unbiased_03_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.3_Imfs_4,Unbiased_Acc_Noise_0.3_Imfs_8,Unbiased_Acc_Noise_0.3_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.484454,0.486402,0.485125
RandomForestClassifier(random_state=1010),0.528233,0.63472,0.612441
SVC(random_state=1010),0.10045,0.093522,0.05702


In [22]:
# Best-scoring configuration per classifier at noise 0.3
# (normal: Best_Normal / Score_x, unbiased: Best_Unbiased / Score_y).
best_03_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.3_Imfs_4,0.968895,Unbiased_Acc_Noise_0.3_Imfs_8,0.486402
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.3_Imfs_8,0.994675,Unbiased_Acc_Noise_0.3_Imfs_8,0.63472
SVC(random_state=1010),Normal_Acc_Noise_0.3_Imfs_4,0.091477,Unbiased_Acc_Noise_0.3_Imfs_4,0.10045


In [23]:
## NOISE = 0.4 --> Analyzing the best EEMD number of IMFs for both NORMAL and UNBIASED experiments with F1_MACRO score
# get_all_df resolves the files as PATH_STEP1 + folder + file name.
# NOTE: NORMAL_F1_RESULTS_FILE / UNBIASED_F1_RESULTS_FILE are reassigned by every
# noise-level cell, so their value depends on which of those cells ran last.
NOISE_04_FOLDER = 'noise_04/'
NORMAL_F1_RESULTS_FILE = 'step1_eemd04_normal_f1_results.xml'
UNBIASED_F1_RESULTS_FILE = 'step1_eemd04_unbiased_f1_results.xml'

# Returns (normal scores, unbiased scores, best configuration per classifier) for noise 0.4.
normal_04_score_df, unbiased_04_score_df, best_04_config_df = get_all_df(NOISE_04_FOLDER, NORMAL_F1_RESULTS_FILE,
                                                                            UNBIASED_F1_RESULTS_FILE, 0.4)

In [24]:
# Mean score per classifier (rows) and IMF count (columns), normal experiment, noise 0.4.
normal_04_score_df

Unnamed: 0,Normal_Acc_Noise_0.4_Imfs_4,Normal_Acc_Noise_0.4_Imfs_8,Normal_Acc_Noise_0.4_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.966819,0.96635,0.96635
RandomForestClassifier(random_state=1010),0.990743,0.993748,0.99323
SVC(random_state=1010),0.091704,0.087554,0.066465


In [25]:
# Mean score per classifier (rows) and IMF count (columns), unbiased experiment, noise 0.4.
unbiased_04_score_df

Unnamed: 0,Unbiased_Acc_Noise_0.4_Imfs_4,Unbiased_Acc_Noise_0.4_Imfs_8,Unbiased_Acc_Noise_0.4_Imfs_16
KNeighborsClassifier(n_neighbors=1),0.477567,0.482634,0.47936
RandomForestClassifier(random_state=1010),0.530108,0.642929,0.594332
SVC(random_state=1010),0.10134,0.094558,0.056769


In [26]:
# Best-scoring configuration per classifier at noise 0.4
# (normal: Best_Normal / Score_x, unbiased: Best_Unbiased / Score_y).
best_04_config_df

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.4_Imfs_4,0.966819,Unbiased_Acc_Noise_0.4_Imfs_8,0.482634
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.4_Imfs_8,0.993748,Unbiased_Acc_Noise_0.4_Imfs_8,0.642929
SVC(random_state=1010),Normal_Acc_Noise_0.4_Imfs_4,0.091704,Unbiased_Acc_Noise_0.4_Imfs_4,0.10134


In [27]:
# Across all six noise levels, keep for each classifier the single best
# (configuration, score) pair of the normal and of the unbiased experiment.
# NOTE(review): the saved output shows '..._Acc_...' labels whereas
# get_score_results_df builds '..._F1_...' labels -- re-run to refresh it.
best_configs = [best_001_config_df, best_005_config_df, best_01_config_df,
                best_02_config_df, best_03_config_df,best_04_config_df]
get_final_df(best_configs)

Unnamed: 0,Best_Normal,Score_x,Best_Unbiased,Score_y
KNeighborsClassifier(n_neighbors=1),Normal_Acc_Noise_0.01_Imfs_4,0.972506,Unbiased_Acc_Noise_0.1_Imfs_8,0.489975
RandomForestClassifier(random_state=1010),Normal_Acc_Noise_0.3_Imfs_8,0.994675,Unbiased_Acc_Noise_0.4_Imfs_8,0.642929
SVC(random_state=1010),Normal_Acc_Noise_0.4_Imfs_4,0.091704,Unbiased_Acc_Noise_0.4_Imfs_4,0.10134
