# MGL870 - TP2 - Utilisation de l’apprentissage machine pour la détection des anomalies
## Pierre Joseph, Jonathan Mésidor, Mohamed Fehd Soufi
## Automne 2024


## Import required libraries

In [None]:
import os
import pandas as pd
import sys
sys.path.append('../../')
from logparser.Drain import LogParser
import re
import json
from collections import defaultdict
from tqdm import tqdm

## Split log file into train-valid-test (60-20-20)

In [None]:
import os

def split_logfile(input_file, train_file, valid_file, test_file, train_ratio=0.6, valid_ratio=0.2):
    """
    Splits a file into train, validation, and test sets based on the given ratios.

    Parameters:
    input_file (str): Path to the input file.
    train_file (str): Path to the output train file.
    valid_file (str): Path to the output validation file.
    test_file (str): Path to the output test file.
    train_ratio (float): Ratio of the train set size to the total size (default is 0.6).
    valid_ratio (float): Ratio of the validation set size to the total size (default is 0.2).
    """
    # Read the input file
    with open(input_file, 'r') as file:
        lines = file.readlines()

    # Calculate split indices
    train_split_index = int(len(lines) * train_ratio)
    valid_split_index = int(len(lines) * (train_ratio + valid_ratio))

    # Split the lines into train, validation, and test sets
    train_lines = lines[:train_split_index]
    valid_lines = lines[train_split_index:valid_split_index]
    test_lines = lines[valid_split_index:]

    # Write the train set to the train file
    with open(train_file, 'w') as file:
        file.writelines(train_lines)

    # Write the validation set to the validation file
    with open(valid_file, 'w') as file:
        file.writelines(valid_lines)

    # Write the test set to the test file
    with open(test_file, 'w') as file:
        file.writelines(test_lines)

    print(f"Split completed: \n {len(train_lines)} lines in {train_file} \n {len(valid_lines)} lines in {valid_file} \n {len(test_lines)} lines in {test_file}")


In [None]:
split_logfile(input_file='input/HDFS_v1/HDFS.log', 
              train_file='input/HDFS_v1/HDFS_train.log', 
              valid_file='input/HDFS_v1/HDFS_valid.log', 
              test_file='input/HDFS_v1/HDFS_test.log')

#Split completed: 
# 6705377 lines in input/HDFS_v1/HDFS_train.log 
# 2235126 lines in input/HDFS_v1/HDFS_valid.log 
# 2235126 lines in input/HDFS_v1/HDFS_test.log

## Training

### Drain 3 parser on HDFS_train.log, HDFS_valid.log and HDFS_test.log

In [None]:
input_dir  = 'input/HDFS_v1/'  # The input directory of log files
output_dir = 'output/'  # The output directory of parsing results
log_files  = ['HDFS_train.log', 'HDFS_valid.log', 'HDFS_test.log']  # List of input log file names
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'  # HDFS log format

# Regular expression list for optional preprocessing (default: [])
regex = [
    r'blk_(|-)[0-9]+',  # block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',  # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
]
st = 0.5  # Similarity threshold
depth = 4  # Depth of all leaf nodes

parser = LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st, rex=regex)

# Iterate over each log file and parse it
for log_file in log_files:
    parser.parse(log_file)

### Mapping
We created the EventId mapping to have a clearer matrix of events. First we sort the EventId by order of magnitude then we associate a value E(x) = {E0, E1, E2 .... } to each one

In [None]:
def mapping(file_names, output_dir):
    """
    Process multiple log template CSV files and save the mappings to JSON files in a specified output directory.
    
    Parameters:
        file_names (list): A list of CSV file names to process.
        output_dir (str): The directory where output JSON files will be saved.
    """
    # Assure that the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    for file_name in file_names:
        log_templates_file = os.path.join(output_dir, file_name)
        
        # Lire le fichier CSV et créer le dictionnaire de mappage
        log_temp = pd.read_csv(log_templates_file).sort_values(by="Occurrences", ascending=False)
        log_temp_dict = {event: f"E{idx + 1}" for idx, event in enumerate(log_temp["EventId"])}
        
        # Définir le nom du fichier de sortie
        output_file = os.path.join(output_dir, f"{file_name.replace('.csv', '')}.json")
        
        # Sauvegarder le dictionnaire de mappage
        with open(output_file, "w") as f:
            json.dump(log_temp_dict, f)
        
        print(f"Mapping completed and saved to {output_file}")

In [None]:
files = [
    "HDFS_train.log_templates.csv",
    "HDFS_valid.log_templates.csv",
    "HDFS_test.log_templates.csv"
]
output_directory = "./output/"

mapping(files, output_directory)

### Convert the structure of HDFS to have Blk
Adding the columns BlockId, Label and Change the EventId to E1, E2, E3 ... so that we can have a complete structure.

In [None]:
def process_log_files(input_dir, output_dir, json_filename, structured_log_filename, anomaly_label_filename, output_filename):
    """
    Process log files to add BlockId, map EventId values, and label anomalies.
    
    Parameters:
        input_dir (str): The directory containing the input files.
        output_dir (str): The directory for output files.
        json_filename (str): The JSON file with EventId mappings.
        structured_log_filename (str): The CSV file with structured log data.
        anomaly_label_filename (str): The CSV file with anomaly labels.
        output_filename (str): The filename for the processed output.
    """
    # Paths for the input files
    json_file_path = os.path.join(output_dir, json_filename)
    anomaly_label_path = os.path.join(input_dir, anomaly_label_filename)
    structured_log_path = os.path.join(output_dir, structured_log_filename)
    
    # Load the structured log file
    df_structured = pd.read_csv(structured_log_path)

    # Load the JSON mapping file
    with open(json_file_path, 'r') as json_file:
        event_mapping = json.load(json_file)

    # Load the anomaly label file
    df_labels = pd.read_csv(anomaly_label_path)
    df_labels['Label'] = df_labels['Label'].replace({'Normal': 'Success', 'Anomaly': 'Fail'})

    # Add BlockId by extracting block identifiers
    df_structured['BlockId'] = df_structured['Content'].apply(lambda x: re.search(r'blk_(|-)[0-9]+', x).group(0) if re.search(r'blk_(|-)[0-9]+', x) else None)

    # Remove rows where BlockId is NaN
    df_structured = df_structured.dropna(subset=['BlockId'])

    # Map EventId using the JSON file
    df_structured['EventId'] = df_structured['EventId'].apply(lambda x: event_mapping.get(x, x))

    # Merge DataFrames to add the Label column
    df_structured = pd.merge(df_structured, df_labels, on='BlockId', how='left')

    # Reorder columns so BlockId and Label are first
    columns = ['BlockId', 'Label'] + [col for col in df_structured.columns if col not in ['BlockId', 'Label']]
    df_structured = df_structured[columns]

    # Save the processed structured log file
    output_path = os.path.join(output_filename)
    df_structured.to_csv(output_path, index=False)

    print(f"Le fichier structuré avec BlockId et les EventId remplacés est généré et sauvegardé dans {output_path}")



In [None]:
# Train
process_log_files(
    input_dir='./input/HDFS_v1/',
    output_dir='./output',
    json_filename='HDFS_train.log_templates.json',
    structured_log_filename='HDFS_train.log_structured.csv',
    anomaly_label_filename='preprocessed/anomaly_label.csv',
    output_filename='HDFS_train.log_structured_blk.csv'
)

In [None]:
# Valid
process_log_files(
    input_dir='./input/HDFS_v1/',
    output_dir='./output',
    json_filename='HDFS_valid.log_templates.json',
    structured_log_filename='HDFS_valid.log_structured.csv',
    anomaly_label_filename='preprocessed/anomaly_label.csv',
    output_filename='HDFS_valid.log_structured_blk.csv'
)

In [None]:
# Test
process_log_files(
    input_dir='./input/HDFS_v1/',
    output_dir='./output',
    json_filename='HDFS_test.log_templates.json',
    structured_log_filename='HDFS_test.log_structured.csv',
    anomaly_label_filename='preprocessed/anomaly_label.csv',
    output_filename='HDFS_test.log_structured_blk.csv'
)

### Sample

In [None]:
def hdfs_sampling(file_names, input_dir, output_dir, window='session', window_size=0):
    """
    Perform HDFS sampling on multiple structured log files.

    Parameters:
        file_names (list): A list of file names to process.
        input_dir (str): The directory containing the input structured log files.
        output_dir (str): The directory where output files will be saved.
        window (str): The type of windowing (default is 'session').
        window_size (int): The size of the window (not used for session windowing).
    """
    assert window == 'session', "Only window=session is supported for HDFS dataset."
    os.makedirs(output_dir, exist_ok=True)

    for file_name in file_names:
        input_path = os.path.join(input_dir, file_name)
        output_path = os.path.join(output_dir, file_name.replace('.csv', '_sequence.csv'))

        print("Loading", input_path)

        # Load the structured log file with optimized data types for large files
        struct_log = pd.read_csv(input_path, engine='c', na_filter=False, memory_map=True, dtype={'Time': str})
        
        # Standardize the Time format by adding leading zeros for HHMMSS format
        struct_log['Time'] = struct_log['Time'].str.zfill(6)

        # Pad Date values with zeros to ensure uniform length of 6
        struct_log['Date'] = struct_log['Date'].astype(str).str.zfill(6)
        
        # Extract BlockId, fill EventId missing values, and map Label to binary
        struct_log['BlockId'] = struct_log['Content'].str.extract(r'(blk_-?\d+)')
        struct_log['EventId'] = struct_log['EventId'].fillna('')
        struct_log['Label'] = struct_log['Label'].apply(lambda x: 1 if x == 'Fail' else 0)

        # Initialize dictionaries to store results
        data_dict = defaultdict(list)
        time_dict = defaultdict(list)
        date_dict = defaultdict(list)
        type_count = defaultdict(int)

        # Group by BlockId for bulk operations
        grouped = struct_log.groupby('BlockId')
        for block_id, group in tqdm(grouped, total=len(grouped)):
            data_dict[block_id] = group['EventId'].tolist()
            time_dict[block_id] = pd.to_datetime(group['Time'], format='%H%M%S', errors='coerce').dropna()
            date_dict[block_id] = group['Date'].tolist()
            type_count[block_id] = group['Label'].sum()  # Count occurrences of "Fail"

        # Build the final DataFrame
        rows = []
        for block_id, events in tqdm(data_dict.items(), total=len(data_dict)):
            features = [event for event in events if event]

            times = time_dict[block_id]
            dates = date_dict[block_id]
            if len(times) > 1:
                time_intervals = [(times.iloc[i] - times.iloc[i - 1]).total_seconds() for i in range(1, len(times))]
                latency = (times.iloc[-1] - times.iloc[0]).total_seconds()
            else:
                time_intervals = []
                latency = 0

            label = 'Fail' if type_count[block_id] > 0 else 'Success'

            # Use the first occurrence of Date and Time as representative for each BlockId
            first_date = dates[0] if dates else ''
            first_time = times.iloc[0].strftime('%H%M%S') if not times.empty else ''

            rows.append({
                "BlockId": block_id,
                "Label": label,
                "Type": type_count[block_id],
                "Features": str(features),
                "Date": first_date,
                "Time": first_time,
                "TimeInterval": str(time_intervals),
                "Latency": latency
            })

        data_df = pd.DataFrame(rows, columns=['BlockId', 'Label', 'Type', 'Features', 'Date', 'Time', 'TimeInterval', 'Latency'])
        data_df.to_csv(output_path, index=False)
        print(f"HDFS sampling completed. Output saved to {output_path}")

In [None]:
files = [
    'HDFS_train.log_structured_blk.csv',
    'HDFS_valid.log_structured_blk.csv',
    'HDFS_test.log_structured_blk.csv'
]
hdfs_sampling(files, input_dir='output/', output_dir='output/')

### Event Occurence Matrix

In [None]:
def generate_event_occurrence_matrix(log_files, event_traces_files, input_dir, output_dir, event_columns=None):
    """
    Generate event occurrence matrices for multiple log files.

    Parameters:
        log_files (list): A list of log file names (e.g., ['HDFS_train.log', 'HDFS_valid.log']).
        event_traces_files (list): A list of corresponding event trace CSV files (e.g., ['output/HDFS_train.log_structured_blk_sequence.csv']).
        input_dir (str): The directory where the log files and other inputs are located.
        output_dir (str): The directory where the output CSV files should be saved.
        event_columns (list): List of event columns (e.g., ['E1', 'E2', ..., 'E29']). If None, defaults to ['E1' to 'E29'].
    """
    if event_columns is None:
        event_columns = [f"E{i}" for i in range(1, 30)]  # Default event columns (E1 to E29)

    # Path to the anomaly label file
    anomaly_label_file = os.path.join(input_dir, "preprocessed/anomaly_label.csv")

    # Load anomaly labels and map them (only once)
    anomaly_labels = pd.read_csv(anomaly_label_file)
    anomaly_labels['Label'] = anomaly_labels['Label'].apply(lambda x: 'Fail' if x == 'Anomaly' else 'Success')
    label_dict = anomaly_labels.set_index('BlockId')['Label'].to_dict()

    # Iterate over each log file and corresponding event trace file
    for log_file, event_traces_file in zip(log_files, event_traces_files):
        output_file = os.path.join(output_dir, f"Event_occurence_matrix_{log_file.replace('.log', '')}.csv")
        print(f"Processing {log_file}...")

        # Load the corresponding event traces for the current log file
        event_traces = pd.read_csv(event_traces_file)

        # Initialize occurrence matrix for each log file
        occurrence_matrix = []

        # Iterate over each row to build the occurrence matrix
        for _, row in event_traces.iterrows():
            block_id = row['BlockId']
            label = label_dict.get(block_id, 'Unknown')
            features = row['Features']
            event_list = re.findall(r"E\d+", features)

            # Count occurrences of each event
            event_counts = {event: event_list.count(event) for event in event_columns}
            
            # Add each entry to the occurrence matrix with Time and Date
            occurrence_matrix.append({
                "BlockId": block_id,
                "Label": label,
                "Type": int(row['Type']) if pd.notna(row['Type']) else 0,  # Ensure integer for Type
                "Time": row['Time'] if 'Time' in row else '',
                "Date": row['Date'] if 'Date' in row else '',
                **event_counts
            })

        # Convert to DataFrame and save
        occurrence_matrix_df = pd.DataFrame(occurrence_matrix)
        occurrence_matrix_df = occurrence_matrix_df[['BlockId', 'Label', 'Type', 'Time', 'Date'] + event_columns]
        occurrence_matrix_df.to_csv(output_file, index=False)
        print(f"Event occurrence matrix for {log_file} saved to {output_file}")



In [None]:
log_files = ['HDFS_train.log', 'HDFS_valid.log', 'HDFS_test.log']
event_traces_files = [
    'output/HDFS_train.log_structured_blk_sequence.csv',
    'output/HDFS_valid.log_structured_blk_sequence.csv',
    'output/HDFS_test.log_structured_blk_sequence.csv'
]
input_dir = './input/HDFS_v1/'
output_dir = './output/'

generate_event_occurrence_matrix(log_files, event_traces_files, input_dir, output_dir)

### Converting Date and Time variables to Datetime

In [3]:
import pandas as pd

# Convert Time column (e.g., 20106 to 20:10:06)
train_data['Time'] = train_data['Time'].apply(lambda x: f"{x:06d}")  # Ensure it's 6 digits
train_data['Time'] = pd.to_datetime(train_data['Time'], format='%H%M%S').dt.time

# Convert Date column (e.g., 81110 to 2010-08-11)
train_data['Date'] = train_data['Date'].apply(lambda x: f"{x:05d}")  # Ensure it's 5 digits
train_data['Date'] = pd.to_datetime(train_data['Date'], format='%m%d%y')

# Save the modified data to a new CSV file
train_data.to_csv('output/Event_occurence_matrix_HDFS_train_updated.csv', index=False)

print(train_data[['Date', 'Time']].head())

        Date      Time
0 2010-08-11  02:01:06
1 2010-08-11  01:51:17
2 2010-08-11  22:14:31
3 2010-08-11  01:24:44
4 2009-08-11  21:15:24
