# MGL870 - TP2 - Utilisation de l’apprentissage machine pour la détection des anomalies
## Pierre Joseph, Jonathan Mésidor, Mohamed Fehd Soufi
## Automne 2024


## Import required libraries

In [None]:
import os
import pandas as pd
import sys
sys.path.append('../../')
from logparser.Drain import LogParser
import re


Preparation des donnees -utilisation de Drain3 pour parser

In [1]:
from logparser.Drain import LogParser

input_dir = './BGL/'  
output_dir = './result' 
log_file = 'BGL.log' 

log_format = "<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>"
# regex = [r"core\.\d+"]
regex = [
        r'(0x)[0-9a-fA-F]+', #hexadecimal
        r'\d+.\d+.\d+.\d+',
        # r'/\w+( )$'
        r'\d+'
    ]


st = 0.5  # Similarity threshold
depth = 4  # Depth of all leaf nodes

parser = LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st, rex=regex)
parser.parse(log_file)

Parsing file: ./BGL/BGL.log














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































## Faisons un mapping des EventId en E1 E2... 

In [None]:
# Chemin du répertoire contenant les fichiers CSV
import json
import os
csv_directory = './result'
import pandas as pd
output_dir = "./BGL/output/"
log_templates_file = os.path.join(csv_directory, 'BGL.log_templates.csv')

def mapping():
    log_temp = pd.read_csv(log_templates_file).sort_values(by="Occurrences", ascending=False)
    log_temp_dict = {event: f"E{idx + 1}" for idx, event in enumerate(log_temp["EventId"])}
    
    # Sauvegarde du dictionnaire de mappage
    output_path = os.path.join(output_dir, "BGL.log_templates.json")
    with open(output_path, "w") as f:
        json.dump(log_temp_dict, f)
    print("Mapping completed and saved to", output_path)
    return log_temp_dict
mapping()


Mapping completed and saved to ./BGL/output/hdfs_log_templates.json


{'fc5f0940': 'E1',
 '6265c739': 'E2',
 '1ae4a1e2': 'E3',
 '1b700d02': 'E4',
 '38a7307d': 'E5',
 '65f23e3e': 'E6',
 '3c469b21': 'E7',
 '3aa50e45': 'E8',
 'd2c9db9b': 'E9',
 'cc8ff6de': 'E10',
 'a31b789f': 'E11',
 '11b69b8b': 'E12',
 '57138c88': 'E13',
 '4983ff07': 'E14',
 '1840cbfe': 'E15',
 '64dc455d': 'E16',
 '147cfcff': 'E17',
 '7c363edf': 'E18',
 'c65da9e1': 'E19',
 '16da8291': 'E20',
 '66c627c6': 'E21',
 '9851467f': 'E22',
 'cce08081': 'E23',
 '614cf99e': 'E24',
 'a2335b6b': 'E25',
 '9d5d6232': 'E26',
 'abb57bda': 'E27',
 '22550c73': 'E28',
 '996c9c62': 'E29',
 '130bb340': 'E30',
 'f4a80ec4': 'E31',
 '71785b2a': 'E32',
 '89854e51': 'E33',
 'abff2903': 'E34',
 'c70dad25': 'E35',
 '8d23c697': 'E36',
 'b48b277c': 'E37',
 '41c5149c': 'E38',
 '87710483': 'E39',
 'd8f41a22': 'E40',
 'ea3efa0b': 'E41',
 '40645e82': 'E42',
 '493f6190': 'E43',
 'ce2b6cdc': 'E44',
 'c3c18d52': 'E45',
 'fb1560b4': 'E46',
 'a9208363': 'E47',
 '618cefb8': 'E48',
 '1bff92a7': 'E49',
 '3a77b37e': 'E50',
 '4aa10e1

## Ce que je fais dans structure_bgl.py consiste simplement à utiliser le modèle extrait par drain pour mapper le fichier journal à event_id et extraire l'heure et d'autres informations à utiliser dans la partie suivante.

In [None]:
import os
import re
import pandas as pd
from tqdm import tqdm

# Paramètres et chemins de fichiers
para = {
    "bgl": "./BGL/BGL.log",
    "template": "./result/BGL.log_templates.csv",
    "structured_file": "./result/BGL.log_structured_blk0.csv"
}

def data_read(filepath):
    """Lecture des logs d'origine et conversion en DataFrame."""
    with open(filepath, "r") as fp:
        data = [line.strip("\n").split() for line in tqdm(fp, desc="Reading Logs", unit=" lines")]
    return pd.DataFrame(data)

def match_events(BGL_df):
    """Associe les événements aux templates."""
    template_df = pd.read_csv(para["template"])
    event2id = {row['EventTemplate']: row['EventId'] for _, row in tqdm(template_df.iterrows(), total=template_df.shape[0], desc="Loading Event Templates", unit=" templates")}

    def find_event_id(log_event):
        for event_template, event_id in event2id.items():
            if re.fullmatch(re.escape(event_template), log_event):
                return event_id
        return 'error'

    # Remplace les valeurs None par des chaînes vides avant de joindre
    BGL_df['EventString'] = BGL_df.iloc[:, 9:].fillna("").agg(" ".join, axis=1)
    tqdm.pandas(desc="Matching Events")
    BGL_df['EventId'] = BGL_df['EventString'].progress_apply(find_event_id)
    return BGL_df

def structure_logs(BGL_df):
    """Structure les logs en extrayant l'étiquette, le temps et les événements correspondants."""
    BGL_df['Label'] = BGL_df.iloc[:, 0]
    BGL_df['Time'] = BGL_df.iloc[:, 4]

    # Création du DataFrame final
    structured_df = BGL_df[['Label', 'Time', 'EventId']]
    structured_df = structured_df[structured_df['EventId'] != 'error']

    # Sauvegarde du DataFrame final structuré
    structured_df.to_csv(para["structured_file"], index=False)
    print(f"Le fichier structuré a été sauvegardé dans {para['structured_file']}")

if __name__ == "__main__":
    # Pipeline d'exécution
    BGL_df = data_read(para["bgl"])
    BGL_df = match_events(BGL_df)
    structure_logs(BGL_df)


Matching Events: 100%|██████████| 148/148 [00:25<00:00,  5.74it/s]
Reading Logs:   0%|          | 0/10000 [00:25<?, ? lines/s]


IndexError: single positional indexer is out-of-bounds

## Sampling BGL

In [6]:
import os
import pandas as pd
import numpy as np
para = {"window_size":0.5,"step_size":0.2,"structured_file":"result/BGL.log_structured_blk.csv","BGL_sequence":'result/BGL.log_sequence.csv'}
output_dir = './result' 
structured_log_blk = os.path.join(output_dir, 'BGL.log_structured_blk.csv')

def bgl_sampling(structured_log_path):
    bgl_structured = pd.read_csv(structured_log_path)
    label_data,time_data,event_mapping_data = bgl_structured['Label'].values,bgl_structured['SecondsSince'].values,bgl_structured['EventId'].values
    log_size = len(label_data)
    # split into sliding window
    start_time = time_data[0]
    start_index = 0
    end_index = 0
    start_end_index_list = []
    # get the first start, end index, end time
    for cur_time in time_data:
        if cur_time < start_time + para["window_size"]*3600:
            end_index += 1
            end_time = cur_time
        else:
            start_end_pair = tuple((start_index,end_index))
            start_end_index_list.append(start_end_pair)
            break
    while end_index < log_size:
        start_time = start_time + para["step_size"]*3600
        end_time = end_time + para["step_size"]*3600
        for i in range(start_index,end_index):
            if time_data[i] < start_time:
                i+=1
            else:
                break
        for j in range(end_index, log_size):
            if time_data[j] < end_time:
                j+=1
            else:
                break
        start_index = i
        end_index = j
        start_end_pair = tuple((start_index, end_index))
        start_end_index_list.append(start_end_pair)
    # start_end_index_list is the  window divided by window_size and step_size, 
    # the front is the sequence number of the beginning of the window, 
    # and the end is the sequence number of the end of the window
    inst_number = len(start_end_index_list)
    print('there are %d instances (sliding windows) in this dataset'%inst_number)

    # get all the log indexs in each time window by ranging from start_index to end_index

    expanded_indexes_list=[[] for i in range(inst_number)]
    expanded_event_list=[[] for i in range(inst_number)]

    for i in range(inst_number):
        start_index = start_end_index_list[i][0]
        end_index = start_end_index_list[i][1]
        for l in range(start_index, end_index):
            expanded_indexes_list[i].append(l)
            expanded_event_list[i].append(event_mapping_data[l])
    #=============get labels and event count of each sliding window =========#
    rows = []
    labels = []

    for j in range(inst_number):
        label = 1   #1 represent success, 0 represent failure
        for k in expanded_indexes_list[j]:
            # If one of the sequences is abnormal (0), the sequence is marked as abnormal
            if label_data[k]:
                label = 0
                continue
        labels.append(label)
    assert inst_number == len(labels)
    print("Among all instances, %d are anomalies"%sum(label_data))
    
    rows.append({        
        "Sequence": expanded_event_list,
        "Label": labels,          
        "TimeInterval": time_data,
        "EventId": event_mapping_data
    })

    BGL_sequence = pd.DataFrame(rows, columns=['Sequence','Label', 'TimeInterval'])
    BGL_sequence.to_csv(para["BGL_sequence"],index=None)

if __name__ == "__main__":
    bgl_sampling(structured_log_blk)


there are 18357 instances (sliding windows) in this dataset
Among all instances, 302177 are anomalies
