# MGL870 - TP2 - Utilisation de l’apprentissage machine pour la détection des anomalies
## Pierre Joseph, Jonathan Mésidor, Mohamed Fehd Soufi
## Automne 2024


## Requirements

`pip install jupyter logparser3 drain3`

## Import required libraries

In [None]:
import os
import pandas as pd
import sys
sys.path.append('../../')
from logparser.Drain import LogParser
import re


## HDFS_V1

### Drain 3 parser on HDFS.log

In [20]:
input_dir = './input/HDFS_v1/'  
output_dir = './results' 
log_file = 'HDFS.log' 

log_format = '<date> <Time> <Pid> <Level> <Component>: <Content>'
regex      = [
    r'blk_(|-)[0-9]+' , # block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)', # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$', # Numbers
]

st = 0.5 
depth = 4  

parser = LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st, rex=regex)
parser.parse(log_file)

Parsing file: ./input/HDFS_v1/HDFS.log
Total lines:  11175629
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 

### Convert the structure of HDFS to have Blk

In [21]:
import os
import re

import pandas as pd


csv_directory = './results'


structured_log_path = os.path.join(csv_directory, 'HDFS.log_structured.csv')

df_structured = pd.read_csv(structured_log_path)

df_structured['BlockId'] = df_structured['Content'].apply(lambda x: re.search(r'blk_(|-)[0-9]+', x).group(0) if re.search(r'blk_(|-)[0-9]+', x) else None)
df_structured = df_structured.dropna(subset=['BlockId'])
columns = ['BlockId'] + [col for col in df_structured.columns if col != 'BlockId']
df_structured = df_structured[columns]
structured_log_path_with_blockid = os.path.join(csv_directory, 'HDFS.log_structured_blk.csv')
df_structured.to_csv(structured_log_path_with_blockid, index=False)
print(f"Le fichier structuré avec BlockId est généré et sauvegardé dans {structured_log_path_with_blockid}")

Le fichier structuré avec BlockId est généré et sauvegardé dans ./results/HDFS.log_structured_blk.csv


### Sample

In [22]:
import os 
import re
import numpy as np 
import pandas as pd
from collections import OrderedDict

def hdfs_sampling(log_file, window='session', window_size=0):
    assert window == 'session', "Only window=session is supported for HDFS dataset."
    print("Loading", log_file)
    struct_log = pd.read_csv(log_file, engine='c',
            na_filter=False, memory_map=True)
    data_dict = OrderedDict()
    for idx, row in struct_log.iterrows():
        blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
        blkId_set = set(blkId_list)
        for blk_Id in blkId_set:
            if not blk_Id in data_dict:
                data_dict[blk_Id] = []
            data_dict[blk_Id].append(row['EventId'])
    data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])
    print(data_df.head())
    data_df.to_csv("results/HDFS_sequence.csv",index=None)

hdfs_sampling('results/HDFS.log_structured.csv')

Loading results/HDFS.log_structured.csv
                    BlockId                                      EventSequence
0  blk_-1608999687919862906  [09a53393, 3d91fa85, 09a53393, 09a53393, d38aa...
1   blk_7503483334202473044  [09a53393, 09a53393, 3d91fa85, 09a53393, d38aa...
2  blk_-3544583377289625738  [09a53393, 3d91fa85, 09a53393, 09a53393, d38aa...
3  blk_-9073992586687739851  [09a53393, 3d91fa85, 09a53393, 09a53393, d38aa...
4   blk_7854771516489510256  [09a53393, 09a53393, 3d91fa85, 09a53393, d38aa...


### Event Occurence Matrix

In [None]:
import sys
sys.path.append('../')

import os
import re
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from logparser import Spell, Drain

# get [log key, delta time] as input for deeplog
input_dir  = './input/HDFS_v1/'
output_dir = './results/'  # The output directory of parsing results
log_file   = "HDFS.log"  # The input log file name
csv_directory = './results/'

log_structured_file = os.path.join(csv_directory, 'HDFS.log_structured_blk.csv')
log_templates_file = os.path.join(csv_directory, 'HDFS.log_templates.csv')
log_sequence_file = os.path.join(csv_directory, 'HDFS_sequence.csv')
blk_label_file = os.path.join(input_dir, "preprocessed/anomaly_label.csv")


def mapping():
    log_temp = pd.read_csv(log_templates_file)
    log_temp.sort_values(by = ["Occurrences"], ascending=False, inplace=True)
    log_temp_dict = {event: idx+1 for idx , event in enumerate(list(log_temp["EventId"])) }
    print(log_temp_dict)
    with open (output_dir + "hdfs_log_templates.json", "w") as f:
        json.dump(log_temp_dict, f)


def hdfs_sampling(log_file, window='session'):
    assert window == 'session', "Only window=session is supported for HDFS dataset."
    print("Loading", log_file)
    df = pd.read_csv(log_file, engine='c',
            na_filter=False, memory_map=True, dtype={'Date':object, "Time": object})

    with open(output_dir + "hdfs_log_templates.json", "r") as f:
        event_num = json.load(f)
    df["EventId"] = df["EventId"].apply(lambda x: event_num.get(x, -1))

    data_dict = defaultdict(list) #preserve insertion order of items
    for idx, row in tqdm(df.iterrows()):
        blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
        blkId_set = set(blkId_list)
        for blk_Id in blkId_set:
            data_dict[blk_Id].append(row["EventId"])

    data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])
    data_df.to_csv(log_sequence_file, index=None)
    print("hdfs sampling done")
    
    
    
    



def generate_train_test(hdfs_sequence_file, n=None, ratio=0.3):
    blk_label_dict = {}
   
    blk_df = pd.read_csv(blk_label_file)
    for _ , row in tqdm(blk_df.iterrows()):
        blk_label_dict[row["BlockId"]] = 1 if row["Label"] == "Anomaly" else 0

    seq = pd.read_csv(hdfs_sequence_file)
    seq["Label"] = seq["BlockId"].apply(lambda x: blk_label_dict.get(x)) #add label to the sequence of each blockid

    normal_seq = seq[seq["Label"] == 0]["EventSequence"]
    normal_seq = normal_seq.sample(frac=1, random_state=20) # shuffle normal data

    abnormal_seq = seq[seq["Label"] == 1]["EventSequence"]
    normal_len, abnormal_len = len(normal_seq), len(abnormal_seq)
    train_len = n if n else int(normal_len * ratio)
    print("normal size {0}, abnormal size {1}, training size {2}".format(normal_len, abnormal_len, train_len))

    train = normal_seq.iloc[:train_len]
    test_normal = normal_seq.iloc[train_len:]
    test_abnormal = abnormal_seq

    df_to_file(train, output_dir + "train")
    df_to_file(test_normal, output_dir + "test_normal")
    df_to_file(test_abnormal, output_dir + "test_abnormal")
    print("generate train test data done")


def df_to_file(df, file_name):
    with open(file_name, 'w') as f:
        for _, row in df.items():
            f.write(' '.join([str(ele) for ele in eval(row)]))
            f.write('\n')


if __name__ == "__main__":   
    mapping()
    hdfs_sampling(log_structured_file)
    generate_train_test(log_sequence_file, n=4855)

{'09a53393': 1, '5d5de21c': 2, 'd38aa58d': 3, 'e3df2680': 4, 'dba996ef': 5, 'd63ef163': 6, '3d91fa85': 7, '626085d5': 8, '81cee340': 9, '32777b38': 10, 'd6b7b743': 11, '73c2ec69': 12, '40651754': 13, '728076ac': 14, '2e68ccc3': 15, 'bcc910df': 16, '0567184d': 17, '8f2bc724': 18, 'c294d20f': 19, '75627efd': 20, '54e5f6b4': 21, '69bca6e5': 22, 'd013b7a3': 23, 'ace40671': 24, 'b15ffff8': 25, 'f266840a': 26, 'f79898ae': 27, '461a1574': 28, '2f85639c': 29, 'f0d1ff15': 30, '506b3c4b': 31, '124068c6': 32, '71cf10b1': 33, '234302e6': 34, 'e024fa48': 35, 'fcd37a6d': 36, '78915d3a': 37, '4610d0f1': 38, '68429f34': 39, 'c61b491b': 40, '2ecc047e': 41, 'ff00cd08': 42, '5c88d71b': 43, '06d16156': 44, '13eb7010': 45, '559305d8': 46, '0f86472a': 47, 'b65fc512': 48}
Loading ./results/HDFS.log_structured_blk.csv


11175629it [39:39, 4696.48it/s]


hdfs sampling done


575061it [01:16, 7553.08it/s]


normal size 558223, abnormal size 16838, training size 4855
generate train test data done
