In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pyod.utils.utility import precision_n_scores
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import column_or_1d
from sklearn.utils.multiclass import type_of_target

In [None]:
def modify_obs_seq(df, perc_rows_2_modif, perc_items_2_modif, random_state=None):
    """Inject synthetic anomalies by swapping items inside randomly chosen sequences.

    A percentage of the rows of ``df`` is drawn at random. In each drawn row,
    a percentage of the positions of the ``'Obs_seq'`` sequence is drawn and
    the drawn positions are swapped pairwise. Drawn rows are labelled with
    ``is_ano = 1``, all other rows with ``is_ano = 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding an ``'Obs_seq'`` column of sequences. It is modified in
        place (``'Obs_seq'`` is updated and ``'is_ano'`` is added) and returned.
    perc_rows_2_modif : float
        Percentage (0-100) of rows to perturb. Values above 100 are capped to
        all rows.
    perc_items_2_modif : float
        Percentage (0-100) of positions to draw inside each perturbed sequence.
        At least two positions are always drawn so that a swap really happens.
    random_state : int or None, optional
        Seed for a dedicated ``numpy.random.RandomState``. When ``None`` the
        global NumPy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the perturbed sequences and ``'is_ano'``.

    Notes
    -----
    * Sequences with fewer than two items cannot be swapped; they are left
      unchanged even when their row is drawn (the row is still labelled 1).
    * When an odd number of positions is drawn, the last one has no partner
      and stays in place.
    * A swap of two equal values leaves the sequence visually unchanged.
    """
    # Keep using the global generator when no seed is given so that callers
    # relying on np.random.seed(...) get the same stream as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def modify_sequence(seq):
        """Return a copy of ``seq`` with randomly chosen positions swapped pairwise."""
        n_items = len(seq)
        if n_items < 2:
            # Nothing can be swapped in an empty or single-item sequence.
            return seq

        # A swap needs two positions; never ask for more positions than exist.
        n_selected = int(n_items * perc_items_2_modif / 100)
        n_selected = min(n_items, max(2, n_selected))
        positions = rng.choice(n_items, n_selected, replace=False)

        # Work on a copy so objects sharing the original sequence are untouched.
        swapped = seq.copy() if hasattr(seq, "copy") else list(seq)
        for k in range(0, len(positions) - 1, 2):
            first, second = positions[k], positions[k + 1]
            swapped[first], swapped[second] = swapped[second], swapped[first]
        return swapped

    # Every row starts as normal; perturbed rows are relabelled below.
    df['is_ano'] = 0

    num_rows_to_modify = min(len(df), int(len(df) * perc_rows_2_modif / 100))
    if num_rows_to_modify <= 0:
        return df

    rows_to_modify = rng.choice(df.index, num_rows_to_modify, replace=False)
    df.loc[rows_to_modify, 'Obs_seq'] = df.loc[rows_to_modify, 'Obs_seq'].apply(modify_sequence)
    df.loc[rows_to_modify, 'is_ano'] = 1

    return df

def format_results(cv_results, mean_score_col, std_score, param_col=None, dataset_name=None):
    """Turn grid-search results into a one-row "mean ± (std)" table for one dataset.

    Parameters
    ----------
    cv_results : dict
        Mapping of column name to values, e.g. ``GridSearchCV.cv_results_``.
    mean_score_col : str
        Name of the column holding the mean score of each candidate.
    std_score : str
        Name of the column holding the score standard deviation of each candidate.
    param_col : str, optional
        Name of the column holding the tested parameter values. Defaults to
        the notebook-level variable ``parameter_tested_col``.
    dataset_name : str, optional
        Label used as the row index. Defaults to the notebook-level variable
        ``file_name``.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``'Dataset'`` with one column per tested
        parameter value, each cell formatted as ``"<mean> ± (<std>)"`` with
        both numbers rounded to 3 decimals.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if param_col is None:
        param_col = parameter_tested_col
    if dataset_name is None:
        dataset_name = file_name

    # Read the results that were passed in, not whatever `clf` happens to be in scope.
    results = pd.DataFrame.from_dict(cv_results)[[param_col, mean_score_col, std_score]].copy()
    results['combined_scores'] = [
        f"{round(mean, 3)} \u00B1 ({round(std, 3)})"
        for mean, std in zip(results[mean_score_col], results[std_score])
    ]

    # Transpose so that the tested parameter values become the column headers.
    df_result = results[[param_col, 'combined_scores']].T
    df_result.columns = df_result.iloc[0]
    df_result.columns.name = None
    df_result = df_result.iloc[1:].copy()  # keep only the formatted-score row

    df_result['Dataset'] = dataset_name
    df_result = df_result.set_index('Dataset')
    return df_result
# df2 = modify_obs_seq(data_seq,10,50)
# display(df2)

In [None]:
# Imports and process-level settings used by the detector class defined in this cell.
import pandas as pd
import numpy as np
from seq2patterns import Seq2patterns
import warnings
# Install an "ignore" filter for every UserWarning raised from here on.
warnings.filterwarnings("ignore", category=UserWarning)
import os
# Set the OMP_NUM_THREADS environment variable to 4.
# NOTE(review): numpy and pandas are already imported at this point; confirm the
# value is still picked up by the libraries it is meant for.
os.environ["OMP_NUM_THREADS"] = "4"

from sklearn.base import BaseEstimator, ClassifierMixin
class MyAnomalyDetectionObjectFromPM(BaseEstimator, ClassifierMixin):
    """Scikit-learn style anomaly detector based on frequent sequential patterns.

    ``fit`` mines patterns through a ``Seq2patterns`` instance; ``decision_function``
    turns the patterns matched by each sequence into an outlier score (higher
    means more anomalous); ``predict`` flags the highest-scoring sequences.

    Parameters
    ----------
    nb_of_frequent_patterns, min_len_of_frequent_pattern, n_clust,
    algo_clustering, aggreg_method
        Forwarded unchanged to ``Seq2patterns``. ``nb_of_frequent_patterns`` is
        also used as the normalising constant of the outlier score.
    seq_ano_perc : float
        Percentage (0-100) of sequences that ``predict`` flags as anomalous.
    item_ano_perc : float
        Stored only; not used by this class.
    ano_method : str
        One of ``'FPOF'``, ``'FPOF_clust'``, ``'WCFPOF'``, ``'WCFPOF_clust'``.
    """

    # Scoring methods implemented by ``decision_function``.
    _UNWEIGHTED_METHODS = ('FPOF', 'FPOF_clust')
    _WEIGHTED_METHODS = ('WCFPOF', 'WCFPOF_clust')

    def __init__(self,nb_of_frequent_patterns, min_len_of_frequent_pattern, n_clust, seq_ano_perc, item_ano_perc, ano_method, algo_clustering, aggreg_method):

        # for pattern mining
        self.nb_of_frequent_patterns = nb_of_frequent_patterns
        self.min_len_of_frequent_pattern = min_len_of_frequent_pattern
        self.kmeans_is_closed = None  # derived from ano_method in fit()
        self.n_clust = n_clust
        self.algo_clustering=algo_clustering
        self.aggreg_method=aggreg_method

        # for anomaly detection
        self.seq_ano_perc = seq_ano_perc
        self.item_ano_perc= item_ano_perc
        self.ano_method = ano_method

        # state produced by fit()
        self.seq2patterns_instance = None
        self.max_len_seq=None

    def get_params(self, deep=True):
        """Return the constructor hyperparameters as a dict (``deep`` is ignored)."""
        return {
        "nb_of_frequent_patterns":self.nb_of_frequent_patterns,
        "min_len_of_frequent_pattern":self.min_len_of_frequent_pattern,
        "algo_clustering":self.algo_clustering,
        "aggreg_method":self.aggreg_method,
        "n_clust":self.n_clust,
        "seq_ano_perc":self.seq_ano_perc,
        "item_ano_perc":self.item_ano_perc,
        "ano_method":self.ano_method
        }

    def set_params(self, **parameters):
        """Set each given parameter as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        """Mine the frequent patterns of the training sequences.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``'Entite'``, ``'Obs_seq'`` and
            ``'Intervals_seq'``.
        y : array-like
            Labels; only used to populate ``classes_``.

        Returns
        -------
        self
            The fitted estimator, as the scikit-learn estimator API expects.
        """
        self.classes_ = np.unique(y, return_inverse=False)
        self.max_len_seq = max(X['Obs_seq'].apply(len))

        X = X[["Entite", "Obs_seq","Intervals_seq"]]

        # The weighted-closed variants work on closed patterns. The original
        # code used `==` here, so the flag was never actually assigned.
        self.kmeans_is_closed = self.ano_method in ['WCFPOF','WCFPOF_clust']

        # Non-clustered methods always use a single cluster. The effective value
        # is kept separately so the `n_clust` hyperparameter is not overwritten.
        if self.ano_method in ['FPOF','WCFPOF','LFPOF']:
            self.n_clust_ = 1
        else:
            self.n_clust_ = self.n_clust

        self.seq2patterns_instance = Seq2patterns(nb_of_frequent_patterns = self.nb_of_frequent_patterns,
                                                  min_len_of_frequent_pattern = self.min_len_of_frequent_pattern,
                                                  kmeans_is_closed = self.kmeans_is_closed,
                                                  n_clust = self.n_clust_,
                                                  algo_clustering = self.algo_clustering,
                                                  aggreg_method=self.aggreg_method)
        self.seq2patterns_instance.fit(X)
        return self

    @staticmethod
    def _pattern_length(combination):
        """Count the comma-separated items between the first '[' and the next ']' (0 if absent)."""
        text = str(combination)
        if '[' in text and ']' in text:
            return len(text.split('[')[1].split(']')[0].split(','))
        return 0

    def decision_function(self, X):
        """Compute one outlier score per sequence of ``X`` (higher = more anomalous).

        For ``'FPOF'`` / ``'FPOF_clust'`` the score is
        ``1 - sum(freq) / (nb_of_frequent_patterns + 1)`` over the patterns
        matched by the sequence. For ``'WCFPOF'`` / ``'WCFPOF_clust'`` each
        frequency is first weighted by ``len_pattern / len_seq``. The ``+ 1``
        accounts for the empty pattern added to every sequence.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with an ``'Obs_seq'`` column; it is passed as-is to
            ``Seq2patterns.transform`` and is not modified.

        Returns
        -------
        list of float
            Scores ordered by the ``seqIndex`` of the pattern table.

        Raises
        ------
        ValueError
            If ``ano_method`` is not one of the four supported methods.
        """
        method = self.ano_method
        if method not in self._UNWEIGHTED_METHODS + self._WEIGHTED_METHODS:
            raise ValueError(
                "Unsupported ano_method %r; expected one of %s"
                % (method, list(self._UNWEIGHTED_METHODS + self._WEIGHTED_METHODS))
            )

        patterns_X = self.seq2patterns_instance.transform(X)

        # The join below uses the row number as key, so work on a copy with a
        # positional index instead of resetting the caller's frame in place.
        X = X.reset_index(drop=True)

        # Add the empty pattern to every sequence so that all sequences are
        # returned, not only those matching at least one frequent pattern.
        if len(X.index):
            null_patterns = pd.DataFrame({
                'Combination': 'Null_Patn',
                'seqIndex': X.index,
                'Cluster': 0,
                'freq': 1,
            })
            patterns_X = pd.concat([patterns_X, null_patterns], ignore_index=True)

        # Attach the length of each sequence and of each pattern for the weighted scores.
        seq_lengths = X['Obs_seq'].apply(len).rename('len_seq').to_frame()
        patterns_X = patterns_X.merge(seq_lengths, left_on='seqIndex', right_index=True, how='left')
        patterns_X['len_pattern'] = patterns_X['Combination'].apply(self._pattern_length)

        if method in self._UNWEIGHTED_METHODS:
            contribution = patterns_X['freq']
        else:
            contribution = patterns_X['freq'] * (patterns_X['len_pattern'] / patterns_X['len_seq'])

        # +1 because of the empty pattern.
        normaliser = self.nb_of_frequent_patterns + 1
        scores = 1 - contribution.groupby(patterns_X['seqIndex']).sum() / normaliser
        return scores.tolist()

    def predict_proba(self, X):
        """Return the raw outlier scores of ``decision_function``.

        NOTE(review): this is a flat list of scores, not an
        ``(n_samples, n_classes)`` probability array.
        """
        return self.decision_function(X)

    def predict(self, X):
        """Flag the ``seq_ano_perc`` percent highest-scoring sequences as anomalies.

        Returns
        -------
        list of int
            ``1`` for flagged sequences and ``0`` otherwise. With ``n`` scores,
            ``int(n * seq_ano_perc / 100)`` sequences are targeted; sequences
            tied with the lowest flagged score are flagged as well.
        """
        scores = self.decision_function(X)

        n_flagged = int(len(scores) * (self.seq_ano_perc / 100))
        n_flagged = max(0, min(n_flagged, len(scores)))
        if n_flagged == 0:
            return [0] * len(scores)

        # The k-th highest score is the cut-off. The original indexed the
        # (k+1)-th one, which flagged one sequence too many and failed at 100%.
        threshold_value = sorted(scores, reverse=True)[n_flagged - 1]
        return [1 if val >= threshold_value else 0 for val in scores]

In [None]:
## %%time
from datetime import datetime



# Datasets to evaluate; each one is read from data/<name>.pkl.
list_dataset = [
    "Helpdesk",
    "BPI_Challenge_2012_A",
    "BPI_Challenge_2012_O",
    "BPI_Challenge_2013_closed_problems",
    "bpi_challenge_2013_incidents"
    ]

# Default anomaly-injection settings (percent of rows / percent of items per row).
perc_rows_2_modif = 10
perc_items_2_modif = 50

# Seed shared by the anomaly injection and the cross-validation splitter.
RANDOM_SEED = 1

scoring = {
           "precision":"precision",
           "f1": "f1",
           "recall": "recall"
          }

# Alternative single-parameter grids:
# parameters={'ano_method':['FPOF','WCFPOF','FPOF_clust','WCFPOF_clust']}
# parameters={'aggreg_method':['min','max','mean']}
# parameters={'n_clust':[1,2,3,4,5,6,7,8,9,10]}
# parameters={'min_len_of_frequent_pattern':[2,3,4,5,6]}
# parameters={'nb_of_frequent_patterns':[0.5,0.6,0.7,0.8,0.9,0.9999]}
parameters={'nb_of_frequent_patterns':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]}

# cv_results_ column of the tested parameter; read by format_results.
parameter_tested_col='param_'+list(parameters.keys())[0]

# Per-dataset result rows, concatenated into the final tables after each dataset.
precision_tables = []
recall_tables = []
f1_tables = []
final_results_precision=None;final_results_recall=None;final_results_f1=None

for file_name in list_dataset:

    start_time = datetime.now()
    print(start_time)

    print("-------------------------------------------------"+file_name+"-------------------------------------------------")
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    data = pd.read_pickle(r'data/%s.pkl'%(file_name))

    # Optional preprocessing: drop sequences that are too short or too long.
    # data = data[data['Obs_seq'].apply(len) >= 1]
    # data = data[data['Obs_seq'].apply(len) <= 500]

    # Optional preprocessing: keep only the last 20 items of each sequence.
    # data['Obs_seq'] = data['Obs_seq'].apply(lambda x: x[-20:])
    # data['Intervals_seq'] = data['Intervals_seq'].apply(lambda x: x[-20:])
    # data['Intervals_seq'] = [[round(val, 2) for val in inner_list] for inner_list in data['Intervals_seq']]

    data['Entite'] = data.index

    # Estimator with default hyperparameters; the grid overrides the tested one.
    MyModel = MyAnomalyDetectionObjectFromPM(nb_of_frequent_patterns=10,
                                        min_len_of_frequent_pattern=3,
                                        n_clust=4,
                                        seq_ano_perc=perc_rows_2_modif,
                                        item_ano_perc=perc_items_2_modif,
                                        ano_method='WCFPOF_clust',
                                        algo_clustering="kmeans",
                                        aggreg_method="min")

    clf = GridSearchCV(MyModel,
                       cv=StratifiedKFold(shuffle=True,
                                           n_splits=5,
                                           random_state=RANDOM_SEED),
                       scoring=scoring,
                       error_score="raise",
                       n_jobs=6,
                       refit=False,
                       verbose=0,
                       param_grid=parameters)

    # Seed the global NumPy RNG so the injected anomalies are reproducible.
    np.random.seed(RANDOM_SEED)
    data_prepared = modify_obs_seq(data,perc_rows_2_modif,perc_items_2_modif)

    clf.fit(data_prepared.drop(columns='is_ano'), data_prepared['is_ano'])

    df_result_precision=format_results(clf.cv_results_, "mean_test_precision", "std_test_precision")
    df_result_recall=format_results(clf.cv_results_, "mean_test_recall", "std_test_recall")
    df_result_f1=format_results(clf.cv_results_, "mean_test_f1", "std_test_f1")

    precision_tables.append(df_result_precision)
    recall_tables.append(df_result_recall)
    f1_tables.append(df_result_f1)

    final_results_precision = pd.concat(precision_tables)
    final_results_recall = pd.concat(recall_tables)
    final_results_f1 = pd.concat(f1_tables)

    display(final_results_precision.style.set_caption('Precision_score'))
    display(final_results_recall.style.set_caption('Recall_score'))
    display(final_results_f1.style.set_caption('F1_score'))

    # Report the time spent on this dataset (previously printed once, after
    # the loop, for the last dataset only).
    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))