In [None]:
# to transform XES to PKL
import os
import glob
import pandas as pd
# from dateutil.parser import parse
from tqdm import tqdm
import pm4py
from collections import Counter


# Show every column when a DataFrame is printed or displayed.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Directory that holds the .xes event logs (replace with your own path).
folder_path = 'data'

# Collect every .xes file located directly inside that directory.
xes_pattern = os.path.join(folder_path, '*.xes')
xes_files = glob.glob(xes_pattern)

# List the base name of each log that was found.
for xes_file in xes_files:
    file_name = os.path.basename(xes_file)
    print(file_name)

# Toggle for the optional display/print calls made further down.
do_display = True

def build_entity_sequence(df_input, entity_colname, obs_colname, time_colname,
                          method_time, *, min_length=1, show_progress=True):
    """Build a sequence dataset with one row per entity.

    The input holds one row per (entity, observation). The output holds one
    row per entity, with the entity's observations ordered by time and a
    parallel sequence of time values.

    Parameters
    ----------
    df_input : dataframe
        Event table; it is copied, never modified.
    entity_colname : str
        Column identifying the entity each event belongs to.
    obs_colname : str
        Column holding the observation (event type) to put in the sequence.
    time_colname : str
        Datetime column used to order the events and to derive time values.
    method_time : str
        How the time sequence is filled:
        "timestamp"    -> hours elapsed since the earliest event of the table,
        "interval"     -> hours until the next event of the same entity
                          (0.0 for the last event),
        "no_timestamp" -> 0 for every event.
    min_length : int, optional
        Entities with fewer events than this are dropped (default 1).
    show_progress : bool, optional
        Wrap the per-entity loop in a tqdm progress bar (default True).

    Returns
    ----------
    df_seq : dataframe or None
        Columns 'Entite', 'Obs_seq' and 'Intervals_seq', one row per entity,
        ordered by entity key. Sequence elements are native Python scalars.
        None is returned when `method_time` is not one of the values above.
    """
    # Reject an unknown method up front, before doing any work.
    if method_time not in ("timestamp", "interval", "no_timestamp"):
        return None

    df = df_input.copy()

    # Express time as hours elapsed since the earliest event of the table.
    elapsed = df[time_colname] - df[time_colname].min()
    df[time_colname] = elapsed.dt.total_seconds() / 3600

    # One stable sort for the whole table: groupby keeps row order inside each
    # group, and events sharing a timestamp stay in their input order.
    df = df.sort_values(by=[time_colname], kind="mergesort")

    groups = [
        (entity, events)
        for entity, events in df.groupby(entity_colname)
        if len(events) >= min_length  # filter short sequences
    ]

    rows = []
    for entity, events in (tqdm(groups) if show_progress else groups):
        observations = events[obs_colname].tolist()

        if method_time == "timestamp":
            time_values = events[time_colname].tolist()
        elif method_time == "interval":
            stamps = events[time_colname].tolist()
            time_values = [later - earlier
                           for earlier, later in zip(stamps, stamps[1:])]
            time_values.append(0.0)  # the last event has no successor
        else:  # "no_timestamp"
            time_values = [0] * len(observations)

        rows.append([entity, observations, time_values])

    # Build the DataFrame once, after every sequence is ready.
    return pd.DataFrame(rows, columns=['Entite', 'Obs_seq', 'Intervals_seq'])

def get_variable_frequency_stats(data, col_name):
    """Show a frequency table for the values of one column.

    Parameters
    ----------
    data : dataframe
        Table containing the column to summarise.
    col_name : str
        Column whose distinct values are counted.

    Returns
    ----------
    None
        The table is only shown (via `display` in IPython/Jupyter, via
        `print` elsewhere); nothing is returned.
    """
    counts = data[col_name].value_counts().sort_index()
    frequencies = counts.values
    total = frequencies.sum()
    # NOTE(review): the cumulative columns follow the sorted-index order of the
    # values, while the table is shown sorted by 'Frequency' - confirm that
    # this is the intended presentation.
    running_total = frequencies.cumsum()

    table = pd.DataFrame({
        'state': counts.index,
        'Frequency': frequencies,
        'Percent': ((frequencies / total) * 100).round(2),
        'Cumulative Frequency': running_total,
        'Cumulative Percent': ((running_total / total) * 100).round(2),
    })

    # `display` is only defined inside IPython/Jupyter sessions; fall back to
    # print so the function also works in a plain Python process.
    try:
        show = display  # noqa: F821
    except NameError:
        show = print
    show(table.sort_values(by=['Frequency']))
    return None



# When True, intermediate tables are shown while each dataset is processed.
do_display = True

# Names (without extension) of the XES logs that can be converted.
list_dataset = [

    "env_permit",
    "Helpdesk",
    "nasa",
    "SEPSIS",
    "BPI_Challenge_2012",
    "BPI_Challenge_2012_A",
    "BPI_Challenge_2012_Complete",
    "BPI_Challenge_2012_O",
    "BPI_Challenge_2012_W",
    "BPI_Challenge_2012_W_Complete",
    "BPI_Challenge_2013_closed_problems",
    "bpi_challenge_2013_incidents",
    "BPI Challenge 2017",
    "BPI_Challenge_2019",

    # "BPIC15_1",
#     "BPIC15_2",
#     "BPIC15_3",
#     "BPIC15_4",
#     "BPIC15_5",
#     "Hospital_log"
]
# Override: the list above is replaced, so only this dataset is converted.
list_dataset = ["DATA_MCF_1an_prest"]

for file_name in list_dataset:
    print("======================================")
    print("-------- Dataset: "+file_name)
    print("======================================")

    # Load the log from folder_path and keep only the case id, timestamp and
    # activity columns.
    data = pm4py.read_xes(os.path.join(folder_path, '%s.xes' % file_name))
    data = pm4py.convert_to_dataframe(data)[["case:concept:name", "time:timestamp", "concept:name"]]
    data = data.rename(columns={
        'time:timestamp': 'date',
        'case:concept:name': 'entity',
        'concept:name': 'type',
    })

    if do_display:
        display(data.head(10))

    # Drop events without a case id. The explicit copy makes the column
    # assignments below act on an independent frame instead of a slice
    # (avoids pandas' SettingWithCopyWarning).
    data = data[data['entity'].notnull()].copy()
    data["date"] = pd.to_datetime(data["date"])
    # Replace case ids and activity names by integer codes.
    data['entity'] = pd.factorize(data['entity'])[0]
    data['type'] = pd.factorize(data['type'])[0]

    if do_display:
        print('df_entity_seq_interval')
    # Build the per-entity sequences using the "interval" time method.
    df_entity_seq_interval = build_entity_sequence(data, 'entity', "type", "date", method_time="interval")
    if do_display:
        display(df_entity_seq_interval.head(10))

    # Save the result next to the source log, as <name>.pkl.
    df_entity_seq_interval.to_pickle(os.path.join(folder_path, "%s.pkl" % file_name))
    
 