In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

import warnings
# Ignorar os warnings
warnings.filterwarnings("ignore")

import time

#PYOD
from pyod.models.knn import KNN 
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.inne import INNE

## PYOD

In [2]:
def def_metricas(test_y, y_test_pred, y_test_scores):
    # Calcula as métricas de avaliação
    precision = precision_score(test_y, y_test_pred)
    recall = recall_score(test_y, y_test_pred)
    f1 = f1_score(test_y, y_test_pred)
    accuracy = accuracy_score(test_y, y_test_pred)
    auc = roc_auc_score(test_y, y_test_scores)

    metricas = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall':recall,
        'F1':f1,
        'AUC':auc
    }
    
    return metricas

In [3]:
def print_results(user, clf_name, best_param, best_windows, results):
    print("Usuário:", user)
    print("Modelo:", clf_name)
    print("Melhor Parametro:", best_param)
    print("Janela:", best_windows)
    print(f"Acurácia: {results['Accuracy']}")
    print(f"Precisão: {results['Precision']}")
    print(f"Recall: {results['Recall']}")
    print(f"F1: {results['F1']}")
    print(f"AUC: {results['AUC']}")

In [4]:
def get_model(model_name, params_dict):
        
    if model_name == 'KNN':
        model = KNN(**params_dict)
    if model_name == 'OCSVM':
        model = OCSVM(**params_dict)
    if model_name == 'LOF':
        model = LOF(**params_dict)
    if model_name == 'INNE':
        model = INNE(**params_dict)
    
    print(f"Modelo: {model}")
    return model

def process_data(genuino, impostor, validacao):
    start_process = time.time()
    # Salvar o novo dataset com as janelas

    train_df = genuino.copy()
    test_df = impostor.copy()
    validacao_df = validacao.copy()
        
    scaler = StandardScaler()

    # Normalização
    cols = train_df.columns
    sc_train = scaler.fit_transform(train_df.values)
    sc_test = scaler.transform(test_df.values)
    sc_validacao = scaler.transform(validacao_df.values)

    # Salvar a normalização
    train_df = pd.DataFrame(sc_train, columns=cols)
    test_df = pd.DataFrame(sc_test, columns=cols)
    validacao_df = pd.DataFrame(sc_validacao, columns=cols)

    # Teste
    n = round(len(train_df) * 0.30)
    add = train_df.tail(n)  # 30%
    # Apagar as linhas do df de treino que foram utilizadas no teste
    train_df.drop(add.index, inplace=True)
    
    test_df = test_df.append(add, ignore_index=True)

    #Validação
    n = round(len(train_df) * 0.20)
    add2 = train_df.tail(n) #20%

    val_df = pd.concat([add2, validacao_df], ignore_index=True)
    # Apaga as linhas do df de treino que foram utilizadas no teste
    train_df.drop(add2.index, inplace=True)

    
    end_process = time.time()
    print("Tempo de execução do processamento: ", end_process - start_process)

    return train_df, val_df, test_df

# Recebe modelo, dados e retorna metricas da execucao do modelo
def exec_model(model, train_df, val_test_df):
    start_exec = time.time()

    train = train_df.copy()
    val_test = val_test_df.copy()

    # Remove label do treino
    train = train.iloc[:,:-1]

    # Coleta e remove label do set de validacao ou teste
    val_test_y = val_test['label']
    val_test = val_test.iloc[:,:-1]

    model.fit(train)

    y_val_test_pred = model.predict(val_test)  # Outlier labels (0 ou 1)
    y_val_test_scores = model.decision_function(val_test)  # Outlier scores

    result_metrics = def_metricas(val_test_y, y_val_test_pred, y_val_test_scores)

    end_exec = time.time()
    print("Tempo de execução do modelo: ", end_exec - start_exec)

    return result_metrics

def best_exec(metrics):
    f1_scores = [metric['scores']['F1'] for metric in metrics]
    best_param_idx = np.argmax(f1_scores)
    return metrics[best_param_idx].values()

def grid_search(model_name, param_grid, processed_data_windows, window_size_list):
    start_grid = time.time()
    metrics = []
    param_grid = ParameterGrid(param_grid)
    for window_size in window_size_list:
        print(f"---> Executando com a janela {window_size}...")
        #train_df, val_df, _ = process_data(genuine, impostor, window_size)
        train_df, val_df, _ =  processed_data_windows[window_size]
        
        for params_dict in param_grid:
            print(f"Executando com os parametros {params_dict}...")
            model = get_model(model_name, params_dict)
            scores = exec_model(model, train_df, val_df)
            print(f"Obteve o f1_score = {scores['F1']} \n ---------------------- ")
            metrics.append({'scores': scores, 'params': params_dict, 'window_size': window_size})

    end_grid = time.time()
    print("Tempo de execução GridSearch: ", end_grid - start_grid)
    
    return metrics


In [6]:
# Lista de usuarios
users = [
    'enrolment_01',
    'enrolment_02',
    'enrolment_03',
    'enrolment_04',
    'enrolment_05',
    'enrolment_06',
    'enrolment_07',
    'enrolment_08',
    'enrolment_09',
    'enrolment_10',
    'enrolment_11',
    'enrolment_12',
    'enrolment_13',
    'enrolment_14',
    'enrolment_15',
    'enrolment_16',
    'enrolment_17',
    'enrolment_18',
    'enrolment_19',
    'enrolment_20'
]

# Lista de tamanhos de janela
window_size_list = [1,2,3,4,5,6,7]

# Lista de parâmetros para cada modelo
models_params_grid = {
    'KNN': {
        'n_neighbors': [5, 10, 15],
    },
    'OCSVM': {
        'nu': [0.1, 0.2, 0.3],
    },
    'LOF': {
        'n_neighbors': [5, 10, 15],
    },
    'INNE': {
        'n_estimators': [80, 100, 125],  
    }
}

user_models_results = {}

for user in users:

    user_models_results[user] = {}
    processed_data_windows = {} #salva as janelas

    for window_size in window_size_list:

        genuine = pd.read_csv('datasets_random/' + user + '_train_data_window_' + str(window_size) + '.csv')
        impostor = pd.read_csv('datasets_random/' + user + '_test_data_window_' + str(window_size) + '.csv')
        validacao = pd.read_csv('datasets_validation/val_data_window_' + str(window_size) + '.csv') 

        print(f"Processando os dados do usuário: {user} com tamanho de janela: {window_size}")

        train_df, val_df, test_df = process_data(genuine, impostor, validacao, window_size)
        processed_data_windows[window_size] = (train_df, val_df, test_df)


    for clf_name in models_params_grid:
        print(f"============================================== \n Executando o modelo {clf_name} para o usuario {user} com os parametros {models_params_grid[clf_name]}...")

        # Grid search with val set
        metrics = grid_search(clf_name, models_params_grid[clf_name], processed_data_windows, window_size_list)

        #TODO: criar uma funcao para visualizar as metricas por janela por modelo por hiperparametro
        #plota_metricas(metrics)

        # Test exec
        _, best_params, best_window_size = best_exec(metrics)
        
        print("\n=========================TESTE===============================")
        print(f"============================================== \n Executando o modelo {clf_name} para o usuario {user} com os parametros {best_params} e janela {best_window_size}...")

        model = get_model(clf_name, best_params)

        # train_df, _, test_df = process_data(genuine, impostor, best_window_size)
        train_df, _, test_df = processed_data_windows[best_window_size]

        results = exec_model(model, train_df, test_df)
        
        user_models_results[user][clf_name] = {}
        user_models_results[user][clf_name]['results'] = results
        user_models_results[user][clf_name]['best_params'] = best_params
        user_models_results[user][clf_name]['best_window_size'] = best_window_size
        
        
        print("\n========================================================\n")
        print("\nResultados finais: \n")
        print_results(user, clf_name, best_params, best_window_size, results)
        print("\n========================================================\n")

Processando os dados do usuário: enrolment_01 com tamanho de janela: 1
Tempo de execução do processamento:  0.02299976348876953
Processando os dados do usuário: enrolment_01 com tamanho de janela: 2
Tempo de execução do processamento:  0.013163089752197266
Processando os dados do usuário: enrolment_01 com tamanho de janela: 3
Tempo de execução do processamento:  0.011955976486206055
Processando os dados do usuário: enrolment_01 com tamanho de janela: 4
Tempo de execução do processamento:  0.011533498764038086
Processando os dados do usuário: enrolment_01 com tamanho de janela: 5
Tempo de execução do processamento:  0.011998891830444336
Processando os dados do usuário: enrolment_01 com tamanho de janela: 6
Tempo de execução do processamento:  0.012508630752563477
Processando os dados do usuário: enrolment_01 com tamanho de janela: 7
Tempo de execução do processamento:  0.01216578483581543
 Executando o modelo KNN para o usuario enrolment_01 com os parametros {'n_neighbors': [5, 10, 15]}

In [8]:
import pprint

pprint.pprint(user_models_results)

{'enrolment_01': {'INNE': {'best_params': {'n_estimators': 80},
                           'best_window_size': 1,
                           'results': {'AUC': 0.992023928215354,
                                       'Accuracy': 0.9519650655021834,
                                       'F1': 0.9686609686609687,
                                       'Precision': 0.9392265193370166,
                                       'Recall': 1.0}},
                  'KNN': {'best_params': {'n_neighbors': 15},
                          'best_window_size': 1,
                          'results': {'AUC': 1.0,
                                      'Accuracy': 1.0,
                                      'F1': 1.0,
                                      'Precision': 1.0,
                                      'Recall': 1.0}},
                  'LOF': {'best_params': {'n_neighbors': 15},
                          'best_window_size': 1,
                          'results': {'AUC': 1.0,
                    

In [9]:
best_window_sizes_by_user = {}
for user in user_models_results:
    best_window_sizes_by_user[user] = {}
    for clf_name in user_models_results[user]:
        try:
            best_window_sizes_by_user[user][user_models_results[user][clf_name]['best_window_size']] += 1
        except KeyError:
            best_window_sizes_by_user[user][user_models_results[user][clf_name]['best_window_size']] = 1

best_window_sizes_by_user

{'enrolment_01': {1: 3, 7: 1},
 'enrolment_02': {5: 1, 7: 2, 3: 1},
 'enrolment_03': {1: 3, 2: 1},
 'enrolment_04': {4: 1, 1: 3},
 'enrolment_05': {5: 1, 1: 3},
 'enrolment_06': {4: 1, 7: 3},
 'enrolment_07': {7: 4},
 'enrolment_08': {7: 4},
 'enrolment_09': {1: 4},
 'enrolment_10': {7: 1, 5: 1, 1: 1, 3: 1},
 'enrolment_11': {1: 3, 4: 1},
 'enrolment_12': {2: 1, 7: 1, 6: 1, 4: 1},
 'enrolment_13': {7: 3, 2: 1},
 'enrolment_14': {7: 4},
 'enrolment_15': {2: 2, 1: 1, 7: 1},
 'enrolment_16': {7: 2, 5: 1, 1: 1},
 'enrolment_17': {7: 3, 3: 1},
 'enrolment_18': {2: 1, 6: 1, 3: 2},
 'enrolment_19': {1: 2, 2: 2},
 'enrolment_20': {2: 1, 5: 1, 1: 1, 4: 1}}

In [10]:
global_best_window_sizes = {}

for user in best_window_sizes_by_user:
    for window_size in best_window_sizes_by_user[user]:
        try:
            global_best_window_sizes[window_size] += best_window_sizes_by_user[user][window_size]
        except KeyError:
            global_best_window_sizes[window_size] = best_window_sizes_by_user[user][window_size]

global_best_window_sizes

{1: 25, 7: 29, 5: 5, 3: 5, 2: 9, 4: 5, 6: 2}

In [11]:
accuracy = []
auc = []
precision = []
f1 = []
recall = []

windows = []

metrics_by_windows = {}

for clf_name in models_params_grid:
    metrics_by_windows[clf_name] = {}
    for user in user_models_results:
        auc.append(user_models_results[user][clf_name]['results']['AUC'])
        accuracy.append(user_models_results[user][clf_name]['results']['Accuracy'])
        f1.append(user_models_results[user][clf_name]['results']['F1'])
        precision.append(user_models_results[user][clf_name]['results']['Precision'])
        recall.append(user_models_results[user][clf_name]['results']['Recall'])
        windows.append(user_models_results[user][clf_name]['best_window_size'])
    
    for i in range(len(windows)):
        try:
            metrics_by_windows[clf_name][windows[i]]['auc'].append(auc[i])
            metrics_by_windows[clf_name][windows[i]]['accuracy'].append(accuracy[i])
            metrics_by_windows[clf_name][windows[i]]['f1'].append(f1[i])
            metrics_by_windows[clf_name][windows[i]]['precision'].append(precision[i])
            metrics_by_windows[clf_name][windows[i]]['recall'].append(recall[i])
        except KeyError:
            metrics_by_windows[clf_name][windows[i]] = {'auc': [auc[i]], 'accuracy': [accuracy[i]], 'f1': [f1[i]], 'precision': [precision[i]], 'recall': [recall[i]]}

    print(clf_name)
    print(" Média AUC: ", np.mean(auc),
        "\n Média Accuracy: ", np.mean(accuracy),
        "\n Média F1-Score: ", np.mean(f1),
        "\n Média Precision: ", np.mean(precision),
        "\n Média Recall: ", np.mean(recall),
        "\n============================"        
        )

KNN
 Média AUC:  0.99252358888214 
 Média Accuracy:  0.9200877121590463 
 Média F1-Score:  0.9373542049012519 
 Média Precision:  0.9333904905821502 
 Média Recall:  0.9616373796892365 
OCSVM
 Média AUC:  0.9943941134066782 
 Média Accuracy:  0.916603072028952 
 Média F1-Score:  0.9413767322239235 
 Média Precision:  0.9181141546560656 
 Média Recall:  0.9785633760712831 
LOF
 Média AUC:  0.9924259920027634 
 Média Accuracy:  0.9123398860926047 
 Média F1-Score:  0.9406933123156574 
 Média Precision:  0.9092669461267822 
 Média Recall:  0.9844688194073591 
INNE
 Média AUC:  0.9845670099726892 
 Média Accuracy:  0.9122700895534234 
 Média F1-Score:  0.9416819342794195 
 Média Precision:  0.9073215900893729 
 Média Recall:  0.9872641611564459 


In [12]:
pprint.pprint(metrics_by_windows)

{'INNE': {1: {'accuracy': [1.0,
                           0.9358407079646017,
                           0.8275862068965517,
                           0.9408983451536643,
                           0.9822335025380711,
                           0.8495575221238938,
                           0.888268156424581,
                           0.8864265927977839,
                           0.8029556650246306,
                           0.7880085653104925,
                           0.9868995633187773,
                           0.8910614525139665,
                           0.9141274238227147,
                           0.8103448275862069,
                           0.9281914893617021,
                           0.8841607565011821,
                           0.9873096446700508,
                           0.8183520599250936,
                           0.9519650655021834,
                           0.8296460176991151,
                           0.8854748603351955,
                           0.

1.0,
                      0.8926461786671525,
                      0.9877282544669153,
                      0.9944373127941806,
                      1.0,
                      0.9850237554224334,
                      0.9817656242376933,
                      1.0,
                      1.0,
                      1.0,
                      1.0,
                      0.9850503525982982,
                      0.9828896805138705,
                      1.0,
                      0.9772774220202438,
                      0.9237327413767868,
                      1.0,
                      1.0,
                      1.0,
                      1.0,
                      0.9396230119772236,
                      0.9824416442883702,
                      0.9103710299068157,
                      1.0,
                      0.9864864864864865],
              'f1': [0.9425556858147714,
                     1.0,
                     0.9776119402985075,
                     1.0,
                 

In [13]:
for model in metrics_by_windows:
     for janela in metrics_by_windows[model]:
           print(f"\nAcurácia média (usuarios) do modelo {model} na janela {janela}")
           print(np.mean(metrics_by_windows[model][janela]['accuracy']))

           print(f"\nAUC média (usuarios) do modelo {model} na janela {janela}")
           print(np.mean(metrics_by_windows[model][janela]['auc']))

           print(f"\nF1-Score média (usuarios) do modelo {model} na janela {janela}")
           print(np.mean(metrics_by_windows[model][janela]['f1']))

           print(f"\nPrecisão média (usuarios) do modelo {model} na janela {janela}")
           print(np.mean(metrics_by_windows[model][janela]['precision']))

           print(f"\nRecall média (usuarios) do modelo {model} na janela {janela}")
           print(np.mean(metrics_by_windows[model][janela]['recall']))

           print("\n---------------------------------------------------")


Acurácia média (usuarios) do modelo KNN na janela 1
0.9373117525105776

AUC média (usuarios) do modelo KNN na janela 1
1.0

F1-Score média (usuarios) do modelo KNN na janela 1
0.9609974018874456

Precisão média (usuarios) do modelo KNN na janela 1
0.9270966615089489

Recall média (usuarios) do modelo KNN na janela 1
1.0

---------------------------------------------------

Acurácia média (usuarios) do modelo KNN na janela 5
0.9789325842696629

AUC média (usuarios) do modelo KNN na janela 5
1.0

F1-Score média (usuarios) do modelo KNN na janela 5
0.9871355060034306

Precisão média (usuarios) do modelo KNN na janela 5
0.9749163879598662

Recall média (usuarios) do modelo KNN na janela 5
1.0

---------------------------------------------------

Acurácia média (usuarios) do modelo KNN na janela 4
0.9511170614107731

AUC média (usuarios) do modelo KNN na janela 4
0.9965107154802028

F1-Score média (usuarios) do modelo KNN na janela 4
0.9700325697943328

Precisão média (usuarios) do modelo 

In [14]:
for model in metrics_by_windows:
     for janela in metrics_by_windows[model]:
           print(f"Modelo {model} na janela {janela}\n")
           
           print(np.mean(metrics_by_windows[model][janela]['auc']))
           
           print(np.mean(metrics_by_windows[model][janela]['accuracy']))

           print(np.mean(metrics_by_windows[model][janela]['f1']))

           print(np.mean(metrics_by_windows[model][janela]['precision']))

           print(np.mean(metrics_by_windows[model][janela]['recall']))

           print("\n---------------------------------------------------")

Modelo KNN na janela 1

1.0
0.9373117525105776
0.9609974018874456
0.9270966615089489
1.0

---------------------------------------------------
Modelo KNN na janela 5

1.0
0.9789325842696629
0.9871355060034306
0.9749163879598662
1.0

---------------------------------------------------
Modelo KNN na janela 4

0.9965107154802028
0.9511170614107731
0.9700325697943328
0.9422251902217905
1.0

---------------------------------------------------
Modelo KNN na janela 7

0.9796357638117702
0.8945597995207771
0.9006753954974257
0.955703143506153
0.8903925133978187

---------------------------------------------------
Modelo KNN na janela 2

1.0
0.8982943981554308
0.9307582921275764
0.8770303357979696
1.0

---------------------------------------------------
Modelo OCSVM na janela 1

1.0
0.8901775264234271
0.9343245655046826
0.8794314098112765
1.0

---------------------------------------------------
Modelo OCSVM na janela 5

0.9990315082644627
0.9636472913158306
0.9758500352839976
0.9533927524738821
