processing the output files here,

In [1]:
import os
import pandas as pd
import numpy as np
import re
import json

In [2]:
def check_error_log(lfp0, lfp1, lpf2):
    paths = [lfp0, lfp1, lpf2 ]
    for log_file_path in paths:
            with open(log_file_path, 'r') as file:
                for line in file:
                    if len(line) >= 1 and line[0] == 'E':
                        return line 
    return None


In [3]:
def get_execution_time(lfp0, lfp1, lpf2 ): # ASSIGN FIRST AND LAST TIMESTAMP TO MIN AND MAX AMONG ALL THESE FILES
    paths = [lfp0, lfp1, lpf2 ]
    first_timestamp = None
    last_timestamp = None
    for log_file_path in paths:

        with open(log_file_path, 'r') as file:
            timestamp_pattern = re.compile(r'^[WIE](\d{4}) (\d{2}:\d{2}:\d{2}\.\d+)')
            for line in file:
                match = timestamp_pattern.search(line)
                if match:
                    time_str = match.group(2)
                    try:
                        timestamp = pd.to_datetime(time_str, format='%H:%M:%S.%f')
                        # Update the first and last timestamps
                        if first_timestamp is None:
                            first_timestamp = timestamp
                        if last_timestamp is None:
                            last_timestamp = timestamp
                        #not none
                        if first_timestamp > timestamp:
                            first_timestamp = timestamp #update to minimum
                        if last_timestamp < timestamp:
                            last_timestamp = timestamp #update to mmax
                    except ValueError as ve:
                        # Handle invalid timestamp formats
                        print(f"Skipping invalid timestamp: {timestamp_str} ({ve})")

            # invalid case
            if first_timestamp is None or last_timestamp is None:
                print("No valid timestamps found")
                return pd.Timedelta(0)
            execution_time = last_timestamp - first_timestamp
            return execution_time

In [4]:
def check_liveness(lfp0, lfp1, lpf2):
  paths = [lfp0, lfp1, lpf2]
  search_phrase = "Node becomes leader"
  for log_file_path in paths:
        with open(log_file_path, 'r') as file:
            for line in file:
                if search_phrase.lower() in line.lower():
                        return True
  return False

In [5]:
def check_safety(lfp0, lfp1, lpf2):
  paths = [lfp0, lfp1, lpf2]
  search_phrase = r"term (\d+) become leader of group" # Regex to match 'term 1', 'term 2', etc.

  terms = {}
  for path in paths:
      with open(path, 'r') as file:
          content = file.read()
          terms[path] = set(re.findall(search_phrase, content))

  # if termsA ^ termsB =! null -> TRUE
  for i, path1 in enumerate(paths):
      for j, path2 in enumerate(paths):
          if i != j:
              if terms[path1] & terms[path2]:  
                  return True
  return False

In [6]:
def parse_schedule_log(log_file_path):
    schedule_data = []
    with open(log_file_path, 'r') as file:
        for line in file:
            match_message= re.match(r"\{(\d+) (\d+) (\w+) SenderVC: (.+) ReceiverVC: (.+)\}", line.strip())
            match_drop= re.match(r"\{(\d+) (\d+) (\w+) is dropped SenderVC: (.+) ReceiverVC: (.+)\}", line.strip())

            if match_message:
                entry = {
                    'drop': 0,
                    'sender': int(match_message.group(1)),
                    'receiver': int(match_message.group(2)),
                    'action': match_message.group(3),
                    'sender_vc': match_message.group(4),
                    'receiver_vc': match_message.group(5)
                }
                schedule_data.append(entry)

            if match_drop:
                entry = {
                    'drop': 1,
                    'sender': int(match_drop.group(1)),
                    'receiver': int(match_drop.group(2)),
                    'action': match_drop.group(3),
                    'sender_vc': match_drop.group(4),
                    'receiver_vc': match_drop.group(5)
                }
                schedule_data.append(entry)
    return schedule_data

In [27]:

benchmarks = ["b0", "b1", "b2"]
schedulers = ["pct1",  "pct2", "pos", "posc", "random"]
dmFaults = ["f0", "f4"]

data = []

# Loop through each combination of benchmark, scheduler, and fault condition
for benchmark in benchmarks:
    for scheduler in schedulers:
        for faulter in dmFaults:
          
          
            for i in range(0, 100):
                if scheduler == "pct1" or scheduler ==  "pct2":
                    scheduler2 = scheduler [0:-1]
                else:
                    scheduler2 = scheduler
                experiment_folder = os.path.join(f"{benchmark}-out", f"{benchmark}-{faulter}-{scheduler}", f"test_{scheduler2}_{i}")
                
               
                if not os.path.exists(experiment_folder):
                    print(f"Skipping non-existent folder: {experiment_folder}")
                    continue
                
                schedule_path = os.path.join(experiment_folder, "schedule.log")
                stderr0_path = os.path.join(experiment_folder, "stderr_0.log")
                stderr1_path = os.path.join(experiment_folder, "stderr_1.log")
                stderr2_path = os.path.join(experiment_folder, "stderr_2.log")
            
                parsed_schedule = parse_schedule_log(schedule_path)
                errorLog = check_error_log(stderr0_path, stderr1_path, stderr2_path)
                t_exec = get_execution_time(stderr0_path, stderr1_path, stderr2_path)
                livenessCheck = check_liveness(stderr0_path, stderr1_path, stderr2_path)
                safetyCheck = check_safety(stderr0_path, stderr1_path, stderr2_path)

                fault = False
                if faulter == "f4":
                    fault = True
                    
                # Create entry and append to data
                entry = {
                    'benchmark': benchmark,
                    'scheduler': scheduler,
                    'drop message faults': fault,
                    'Error log': errorLog,
                    'liveness': livenessCheck,
                    'safety violation': safetyCheck,
                    'Schedule': parsed_schedule,
                    'Execution_time': t_exec
                }
                data.append(entry)
                print(entry)


{'benchmark': 'b0', 'scheduler': 'pct1', 'drop message faults': False, 'Error log': None, 'liveness': True, 'safety violation': False, 'Schedule': [{'drop': 0, 'sender': 1, 'receiver': 2, 'action': 'pre_vote', 'sender_vc': 'map[0:0 1:1 2:0]', 'receiver_vc': 'map[0:0 1:1 2:1]'}, {'drop': 0, 'sender': 1, 'receiver': 2, 'action': 'request_vote', 'sender_vc': 'map[0:0 1:2 2:0]', 'receiver_vc': 'map[0:0 1:2 2:2]'}, {'drop': 0, 'sender': 1, 'receiver': 0, 'action': 'pre_vote', 'sender_vc': 'map[0:0 1:3 2:0]', 'receiver_vc': 'map[0:1 1:3 2:0]'}, {'drop': 0, 'sender': 1, 'receiver': 0, 'action': 'request_vote', 'sender_vc': 'map[0:0 1:4 2:0]', 'receiver_vc': 'map[0:2 1:4 2:0]'}, {'drop': 0, 'sender': 1, 'receiver': 0, 'action': 'append_entries', 'sender_vc': 'map[0:0 1:5 2:0]', 'receiver_vc': 'map[0:3 1:5 2:0]'}, {'drop': 0, 'sender': 2, 'receiver': 0, 'action': 'pre_vote', 'sender_vc': 'map[0:0 1:2 2:3]', 'receiver_vc': 'map[0:4 1:5 2:3]'}], 'Execution_time': Timedelta('0 days 00:00:16.801700

In [8]:
df = pd.DataFrame(data)
df

Unnamed: 0,benchmark,scheduler,drop message faults,Error log,liveness,safety violation,Schedule,Execution_time
0,b0,pct1,False,,False,False,"[{'drop': 0, 'sender': 1, 'receiver': 2, 'acti...",0 days 00:00:18.355514
1,b0,pct1,False,,False,False,"[{'drop': 0, 'sender': 0, 'receiver': 2, 'acti...",0 days 00:00:17.525810
2,b0,pct1,False,,True,False,"[{'drop': 0, 'sender': 2, 'receiver': 1, 'acti...",0 days 00:00:16.275403
3,b0,pct1,False,E0824 19:12:30.564974 1335 0 /src/braft_b...,True,False,"[{'drop': 0, 'sender': 0, 'receiver': 2, 'acti...",0 days 00:00:17.790498
4,b0,pct1,False,,True,False,"[{'drop': 0, 'sender': 1, 'receiver': 0, 'acti...",0 days 00:00:10.695884
...,...,...,...,...,...,...,...,...
2371,b2,random,True,,True,False,"[{'drop': 0, 'sender': 2, 'receiver': 0, 'acti...",0 days 00:00:17.793412
2372,b2,random,True,,True,False,"[{'drop': 0, 'sender': 2, 'receiver': 1, 'acti...",0 days 00:00:16.812568
2373,b2,random,True,,False,False,"[{'drop': 0, 'sender': 0, 'receiver': 2, 'acti...",0 days 00:00:17.211482
2374,b2,random,True,,True,False,"[{'drop': 0, 'sender': 0, 'receiver': 2, 'acti...",0 days 00:00:16.781371


scheduler | fault condifiton | number of unique schedules | number of unique traces | number unieuq error lines | number of possible liveness bugs | number of safety violations | 

In [28]:
def serialize_schedule(schedule): #this way comparison will be faster (?)
    return json.dumps(schedule)

In [31]:
def get_trace(nodeID, schedule):
    filtered_schedule = [entry for entry in schedule if entry['sender'] == nodeID or entry['receiver'] == nodeID]
    return filtered_schedule

In [35]:

# Initialize dictionary to store statistics
statistics = []

# Aggregate statistics
for benchmark in set(entry['benchmark'] for entry in data):
    for scheduler in set(entry['scheduler'] for entry in data):
        for fault in set(entry['drop message faults'] for entry in data):
            # Filter entries for the current benchmark, scheduler, and fault condition
            filtered_entries = [entry for entry in data 
                                if entry['benchmark'] == benchmark and
                                   entry['scheduler'] == scheduler and
                                   entry['drop message faults'] == fault]

            # If there are no entries for the combination, skip to avoid errors
            if not filtered_entries:
                continue

            filtered_entries_df = pd.DataFrame(filtered_entries)
            serialized_schedules= filtered_entries_df['Schedule'].apply(serialize_schedule)
            unique_schedules = serialized_schedules.unique()
            unique_schedules = len(unique_schedules)
            
            # Number of unique traces (assuming Error log is a list of lines; convert to tuple for hashing)
            #unique_traces = len(set(tuple(entry['Error log']) for entry in scheduler_fault_entries))
            node_ids = [0, 1, 2]
            tracesOfN0= filtered_entries_df['Schedule'].apply(lambda s: get_trace(0,s))
            tracesOfN1= filtered_entries_df['Schedule'].apply(lambda s: get_trace(1,s))
            tracesOfN2= filtered_entries_df['Schedule'].apply(lambda s: get_trace(2,s))
           
            general_traces = tracesOfN0 || tracesOfN1  ||tracesOfN2
   
            traces_ser = general_traces.apply(serialize_schedule)
            unique_traces = len(general_traces.unique())

            # Number of unique error lines (assuming Error log is a single string; convert to set for unique lines)
            error_lines = len(set(line for entry in filtered_entries if entry['Error log'] is not None for line in entry['Error log']))

            # Number of possible liveness bugs (count where liveness is true)
            num_liveness_bugs = (filtered_entries_df['liveness'] == False).sum()
            
            # Number of safety violations
            num_safety_violations = (filtered_entries_df['safety violation'] == True).sum()

            mean_Texec = filtered_entries_df['Execution_time'].mean()
            # Append to statistics list
            statistics.append({
                'benchmark' : benchmark,
                'scheduler': scheduler,
                'fault condition': fault,
                'number of unique schedules': unique_schedules,
                
                'number of unique traces': unique_traces,
                'number of tests with error logged': error_lines,
                'number of possible liveness bugs': num_liveness_bugs,
                'number of safety violations': num_safety_violations,
                'mean execution time' : mean_Texec
            })

# Create DataFrame from statistics
df_statistics = pd.DataFrame(statistics)

# Display DataFrame
df_statistics


TypeError: unhashable type: 'list'