## Helper Functions

In [84]:
import os
import re
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import pandas as pd

ip_pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'

## Get only defcon changes

In [85]:

# Function to parse the log file and extract only WARNING log entries
def extract_warning_logs(input_path, output_path):
    with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file:
        for line in input_file:
            if "WARNING" in line:
                output_file.write(line)




## Concat timestamps of log files

In [86]:
def get_log_files(directory):
    """
    Retrieves all log files in the specified directory.

    :param directory: Path to the directory to search for log files.
    :return: List of paths to log files found in the directory.
    """
    log_files = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith('.log')]
    return log_files


def concat_timestamps_for_dir(directory_path):
    for log_file in get_log_files(directory_path):
        extract_warning_logs(log_file, log_file.replace('_eval.log', '_defcon_warn.log'))
        if 'trigger' in log_file:
            extract_warning_logs(log_file, log_file.replace('_eval_trigger.log', '_trigger_defcon_warn.log'))

    # Updated Regex to match log entries and extract timestamp including milliseconds
    log_entry_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}).*Increased defcon mode to: defcon_4_monitoring'

    # Function to read the triggerdefcon file and extract relevant log entries
    def extract_defcon_entries(file_path, pattern):
        defcon_entries = []
        with open(file_path, 'r') as file:
            for line in file:
                match = re.search(pattern, line)
                if match:
                    defcon_entries.append(match.group(1))
        return pd.DataFrame(defcon_entries, columns=['Timestamp'])

    # Function to append timestamps from other log files within the same minute
    def append_timestamps_from_files(df, log_file_paths, pattern):
        for file_path in log_file_paths:
            if 'trigger_defcon' in file_path or not '_warn' in file_path:
                continue

            file_timestamps = []
            with open(file_path, 'r') as file:
                for line in file:
                    match = re.search(pattern, line)
                    if match:
                        # Now including milliseconds in the parsing format
                        file_timestamps.append(datetime.strptime(match.group(1), '%Y-%m-%d %H:%M:%S.%f'))

            # For each timestamp in the trigger file, search for entries within the same minute in this file
            search_results = []
            for trigger_timestamp in df['Timestamp']:
                start_window = trigger_timestamp
                end_window = trigger_timestamp + timedelta(minutes=1)
                matching_timestamp = next((ts for ts in file_timestamps if start_window <= ts < end_window), None)
                search_results.append(matching_timestamp)
            
            match = re.search(ip_pattern, file_path)
            # Extract the IP address if a match is found
            ip_address = match.group(0)
            df[ip_address] = search_results

        return df

    # Assuming directory_path is set correctly
    log_file_paths = get_log_files(directory_path)
    trigger_file = next((f for f in log_file_paths if 'trigger_defcon_warn' in f), None)

    if trigger_file:
        # Create the initial DataFrame from the triggerdefcon file
        df_defcon = extract_defcon_entries(trigger_file, log_entry_pattern)
        df_defcon['Timestamp'] = pd.to_datetime(df_defcon['Timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

        # Append timestamps from other log files within the same minute
        # df_defcon_complete = pd.DataFrame()
        df_defcon_complete = append_timestamps_from_files(df_defcon, log_file_paths, log_entry_pattern)

        # Display the head of the complete DataFrame
        return df_defcon_complete
    else:
        print('Trigger file not found')





## Calculate delays

In [87]:
def calculate_longest_delay(df):
    # Initialize a list to store the longest delay for each row
    longest_delays = []

    # Iterate over DataFrame rows
    for index, row in df.iterrows():
        # Extract the first timestamp from the trigger file
        trigger_timestamp = row['Timestamp']

        # Initialize a variable to keep track of the longest delay for the current row
        max_delay = pd.Timedelta(0)

        # Iterate over all other columns to calculate delays
        for col in df.columns[1:]:  # Skipping the first column as it's the trigger timestamp
            if pd.isnull(row[col]):
                continue  # Skip if the timestamp is NaN
            # Calculate the time difference
            current_delay = row[col] - trigger_timestamp
            # Update max_delay if the current delay is longer
            if current_delay > max_delay:
                max_delay = current_delay

        # Append the longest delay for the current row to the list
        longest_delays.append(max_delay)

    # Add the list as a new column to the DataFrame
    df['LongestDelay'] = longest_delays
    return df




In [88]:


# # Assuming df_defcon_complete_with_delays is your DataFrame containing the 'LongestDelay' column
# # First, convert the 'LongestDelay' column to total seconds for plotting
# df_defcon_complete_with_delays['LongestDelaySeconds'] = df_defcon_complete_with_delays[
#     'LongestDelay'].dt.total_seconds()
# 
# # Now, create a boxplot for the 'LongestDelaySeconds' column
# plt.figure(figsize=(10, 6))
# plt.boxplot(df_defcon_complete_with_delays['LongestDelaySeconds'].dropna(), vert=False)
# plt.title('Boxplot of Longest Delays')
# plt.xlabel('Seconds')
# plt.ylabel('Longest Delay')
# plt.grid(True)
# 
# # Show the plot
# plt.show()


In [89]:
def get_exactly_tewnty_rows(df):
    # Assuming df is your DataFrame
    # Check if the last row has any NA values
    if df.iloc[-1].isna().any():
        # Drop the last row if it has any NA values
        df = df.iloc[:-1]

    # Ensure the DataFrame has at least 20 rows
    if len(df) > 20:
        # Keep only the last 20 rows
        df = df.tail(20)
    else:
        print('Error: df has less than 20 rows...')

    return df


# Assume directory_path is correctly defined as needed
node_names = ['ed_02', 'fog_01']

for node_name in node_names:
    directory_path_r1 = os.path.join(node_name, 'r1')
    directory_path_r2 = os.path.join(node_name, 'r2')
    directory_path_r3 = os.path.join(node_name, 'r3')

    df_r1 = concat_timestamps_for_dir(directory_path_r1)
    df_r2 = concat_timestamps_for_dir(directory_path_r2)
    df_r3 = concat_timestamps_for_dir(directory_path_r3)

    df_r1_short = get_exactly_tewnty_rows(df_r1)
    df_r2_short = get_exactly_tewnty_rows(df_r2)
    df_r3_short = get_exactly_tewnty_rows(df_r3)

    df_concat = pd.concat([df_r1_short, df_r2_short, df_r3_short], ignore_index=True)
    
    df_concat_delay = calculate_longest_delay(df_concat)
    break
    

IndexError: single positional indexer is out-of-bounds