In [922]:
!pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [923]:
import re
import pandas as pd
from collections import defaultdict

def get_transaction_count(line):
    pattern = r"transactions = \{([^}]+)\}"
    match = re.search(pattern, line)
    if match:
        return set(re.findall(r"Transaction ([a-fA-F0-9]+)", match.group(1)))
    return set()

def get_timestamp(line):
    pattern = r"^\d+\.\d+"
    match = re.match(pattern, line)
    return float(match.group(0)) if match else None

def get_node_name(line):
    pattern = r"Node ([A-Za-z0-9]+)"
    match = re.search(pattern, line)
    return match.group(1) if match else None

def count_unique_mempool_transactions(file_path, node_number):
    unique_transactions = set()
    with open(file_path, 'r') as file:
        for line in file:
            if f"Node {node_number}" in line and "from mempool" in line:
                unique_transactions.update(re.findall(r"Transaction ([a-fA-F0-9]+)", line))
    return len(unique_transactions)

def extract_slot(message_line):
    """
    Extracts the slot number from an externalize message log line.
    Looks for the pattern "slot <number>".
    """
    pattern = r"slot (\d+)"
    match = re.search(pattern, message_line)
    if match:
        return int(match.group(1))
    return None

def process_log_lines(file_path):
    """
    Extracts **all** SCPExternalize messages per node and stores their relevant details.
    A new column 'Slot' is added by parsing the slot number from the log message.
    """
    data = []

    with open(file_path, 'r') as file:
        for line in file:
            # Only consider lines that contain relevant externalization messages
            if ("appended SCPExternalize message for slot" not in line and
                "adopting externalized value for slot" not in line):
                continue
            
            node_name = get_node_name(line)
            timestamp = get_timestamp(line)
            transactions = get_transaction_count(line)
            slot = extract_slot(line)  # Extract slot directly from the log line

            if node_name:
                data.append({
                    "node name": node_name,
                    "Timestamp of finalisation": timestamp,
                    "Finalised transactions": transactions,
                    "Externalize message": line.strip(),
                    "Slot": slot
                })
    
    # Convert the collected data to a DataFrame
    df = pd.DataFrame(data)
    
    # If any row didn't have a slot parsed, you might want to drop or handle it:
    df = df.dropna(subset=["Slot"])

    # Count the number of finalized transactions for each externalize message
    df["No. of finalised transactions"] = df["Finalised transactions"].apply(len)
    
    # Compute total transactions for each node from mempool logs
    df["total_transactions"] = df["node name"].apply(lambda node: count_unique_mempool_transactions(file_path, node))
    
    # Calculate number of transactions not finalized for each node
    df["no. of transactions not finalised"] = df["total_transactions"] - df["No. of finalised transactions"]

    return df

# Example usage:
# file_path = 'path/to/your/simulator_events_log.txt'
# df = process_log_lines(file_path)
# print(df)


In [924]:
file_path = 'src/simulator_events_log.txt'

df = process_log_lines(file_path)


# df_sorted = df.sort_values(by='Timestamp of finalisation', ascending=True)

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Prevent line wrapping
pd.set_option('display.colheader_justify', 'center')  # Center column headers
pd.set_option('display.float_format', '{:.4f}'.format)  # Format float values

df = df.sort_values(by='Slot', ascending=True)

display(df) 

Unnamed: 0,node name,Timestamp of finalisation,Finalised transactions,Externalize message,Slot,No. of finalised transactions,total_transactions,no. of transactions not finalised
0,37,11.18,{8f11e01a},11.18 - NODE - INFO - Node 37 appended SCPExte...,1,1,31,30
32,13,11.69,{8f11e01a},11.69 - Node 13 adopting externalized value f...,1,1,41,40
33,23,11.69,{8f11e01a},11.69 - Node 23 adopting externalized value f...,1,1,38,37
34,40,11.69,{8f11e01a},11.69 - Node 40 adopting externalized value f...,1,1,36,35
35,25,11.7,{8f11e01a},11.70 - Node 25 adopting externalized value f...,1,1,40,39
36,7,11.7,{8f11e01a},11.70 - Node 7 adopting externalized value fo...,1,1,40,39
37,52,11.71,{8f11e01a},11.71 - Node 52 adopting externalized value f...,1,1,42,41
38,6,11.72,{8f11e01a},11.72 - Node 6 adopting externalized value fo...,1,1,36,35
39,29,11.73,{8f11e01a},11.73 - Node 29 adopting externalized value f...,1,1,30,29
40,36,11.73,{8f11e01a},11.73 - Node 36 adopting externalized value f...,1,1,29,28


## Analyze Transaction Matches across slots

In [925]:
def analyze_transaction_matches(df):
    tx_occurrences = {}

    for idx, row in df.iterrows():
        node = row['node name']
        slot = row['Slot']
        # row['Finalised transactions'] is a set; iterate through each transaction hash
        for tx in row['Finalised transactions']:
            if tx not in tx_occurrences:
                tx_occurrences[tx] = set()
            tx_occurrences[tx].add((node, slot))
    
    duplicates = {tx: occ for tx, occ in tx_occurrences.items() if len(occ) > 1} 

    return duplicates


duplicates = analyze_transaction_matches(df)
print("Transactions that appear in more than one slot:")
for tx, occ in duplicates.items():
    print(f"Transaction {tx} appears in: {sorted(list(occ))}")


Transactions that appear in more than one slot:
Transaction 8f11e01a appears in: [('0', 1), ('1', 1), ('10', 1), ('11', 1), ('12', 1), ('13', 1), ('14', 1), ('15', 1), ('16', 1), ('17', 1), ('18', 1), ('19', 1), ('2', 1), ('20', 1), ('21', 1), ('22', 1), ('23', 1), ('24', 1), ('25', 1), ('26', 1), ('27', 1), ('28', 1), ('29', 1), ('3', 1), ('30', 1), ('31', 1), ('32', 1), ('33', 1), ('34', 1), ('35', 1), ('36', 1), ('37', 1), ('38', 1), ('39', 1), ('4', 1), ('40', 1), ('41', 1), ('42', 1), ('43', 1), ('44', 1), ('45', 1), ('46', 1), ('47', 1), ('48', 1), ('49', 1), ('5', 1), ('50', 1), ('51', 1), ('52', 1), ('53', 1), ('54', 1), ('55', 1), ('56', 1), ('57', 1), ('58', 1), ('59', 1), ('6', 1), ('7', 1), ('8', 1), ('9', 1)]
Transaction 1c85e40 appears in: [('0', 2), ('1', 2), ('10', 2), ('11', 2), ('12', 2), ('13', 2), ('14', 2), ('15', 2), ('16', 2), ('17', 2), ('18', 2), ('19', 2), ('2', 2), ('20', 2), ('21', 2), ('22', 2), ('23', 2), ('24', 2), ('25', 2), ('26', 2), ('27', 2), ('28', 

#### SHOW ALL TRANSACTIONS THAT APPEAR IN MULTIPLE SLOTS FOR THE SAME NODE

In [926]:
def analyze_node_slots_and_multislot_occurrences(df):
    """
    Analyzes the DataFrame to extract:
      - For each node, a dictionary mapping each Slot to the set of finalized transaction hashes 
        that were reported in that slot.
      - For each node, a dictionary mapping transaction hashes (that occur in multiple slots) to the
        set of slot numbers where they appear.
    
    Args:
        df (pandas.DataFrame): A DataFrame with at least the columns:
            - 'node name'
            - 'Slot'
            - 'Finalised transactions' (a set of transaction hashes)
    
    Returns:
        A tuple (node_slots, multi_occurrences) where:
          - node_slots is a dict: { node_name: { slot: set(transaction hashes), ... }, ... }
          - multi_occurrences is a dict: { node_name: { transaction_hash: set(slots) where it appears, ... }, ... }
    """
    # Build a dictionary that records for each node which transactions are finalized in each slot.
    node_slots = {}
    for idx, row in df.iterrows():
        node = row['node name']
        slot = row['Slot']
        txs = row['Finalised transactions']  # expected to be a set of transaction hashes
        if node not in node_slots:
            node_slots[node] = {}
        if slot not in node_slots[node]:
            node_slots[node][slot] = set()
        node_slots[node][slot].update(txs)
    
    # Now, for each node, gather transaction occurrences across slots.
    multi_occurrences = {}
    for node, slot_dict in node_slots.items():
        tx_occurrences = {}  # maps tx -> set of slots where it occurs
        for slot, txs in slot_dict.items():
            for tx in txs:
                if tx not in tx_occurrences:
                    tx_occurrences[tx] = set()
                tx_occurrences[tx].add(slot)
        # Filter to only transactions that appear in more than one slot.
        multi = {tx: slots for tx, slots in tx_occurrences.items() if len(slots) > 1}
        if multi:
            multi_occurrences[node] = multi

    return node_slots, multi_occurrences

# Example usage:
node_slots, multi_occurrences = analyze_node_slots_and_multislot_occurrences(df)

print("Per-node transaction sets by slot:")
for node, slot_info in node_slots.items():
    print(f"Node {node}:")
    for slot, txs in sorted(slot_info.items()):
        print(f"  Slot {slot}: {sorted(list(txs))}")

print("\nTransactions that appear for the same node in multiple slots:")
for node, tx_info in multi_occurrences.items():
    print(f"Node {node}:")
    for tx, slots in tx_info.items():
        print(f"  Transaction {tx} appears in slots: {sorted(list(slots))}")


Per-node transaction sets by slot:
Node 37:
  Slot 1: ['8f11e01a']
  Slot 2: ['1223dcd3', '1c85e40', '20855dfc', '30f76354', '369339f0', '37e122c7', '39b76bd0', '39b9fdf8', '3bc93dfd', '429089ba', '45e8c2a1', '47dde2a1', '4a505529', '4ecfe7f9', '5388e912', '5800f46d', '5c0f598a', '621883ce', '69a63c57', '6bcb57d9', '6bf9f3d4', '6d187969', '6fa86be8', '72fcc5f2', '7a484513', '7b410014', '802b31f8', '969ef920', '9c2523c1', 'a2135f09', 'b005578d', 'b1b6d244', 'b5b26135', 'b8faa305', 'ba30b0c8', 'ba8d2548', 'c701e2a1', 'c789e0e4', 'ccaf3d81', 'cd59e482', 'd3b98430', 'e48db79c', 'e6058455', 'ecc83c05', 'efa93def', 'f72e3986']
  Slot 3: ['b638aac4']
  Slot 4: ['1ad8bd55', '1e65da5', '1f0faaa0', '29a9f477', '2cb294eb', '31d72be1', '37c80700', '383659fe', '3caed2ef', '3ef8814a', '4182d1b7', '4466c4b1', '4550aa92', '463a00c4', '562b46c', '5aefc6d', '64a74882', '674a0bae', '6cf5ded4', '6e807701', '6f9b7972', '73533f58', '78ec9a4f', '83c57893', '85caf54', '86b09374', '86c62fb5', '8d5d8b55', '911b

In [927]:
import pandas as pd

def analyze_transaction_matches_with_msgs(df):
    """
    Find transactions that show up in more than one slot *and* record
    the first 3 words of the log messages that caused each appearance.
    
    Returns a dict:
      tx_hash -> {
         'occurrences': set of (node, slot),
         'msg_types':   set of first‑3‑word summaries
      }
    """
    tx_occurrences = {}   # tx -> set((node,slot))
    tx_msgtypes   = {}    # tx -> set(msg_type)
    
    for _, row in df.iterrows():
        node    = row['node name']
        slot    = row['Slot']
        txs     = row['Finalised transactions']
        msg     = row['Externalize message']
        # grab the first 3 words of the log line
        msg_type = " ".join(msg.split()[:10])
        
        for tx in txs:
            tx_occurrences.setdefault(tx, set()).add((node, slot))
            tx_msgtypes.setdefault(tx,   set()).add(msg_type)
    
    # now filter to only those that appear in >1 slot
    duplicates = {}
    for tx, occ in tx_occurrences.items():
        if len(occ) > 1:
            duplicates[tx] = {
                'occurrences': occ,
                'msg_types':   tx_msgtypes.get(tx, set())
            }
    return duplicates

# --- example usage ---
duplicates = analyze_transaction_matches_with_msgs(df)

print("Transactions that appear in more than one slot, with their first‐3‐word log types:")
for tx, info in duplicates.items():
    occ       = sorted(info['occurrences'])
    msg_types = sorted(info['msg_types'])
    print(f"- {tx}:")
    print(f"    slots:    {occ}")
    print(f"    messages: {msg_types}")


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [928]:
def analyze_node_repeated_txs_with_msgs(df):
    """
    For each node, find transactions that are finalized
    in more than one slot.  For each such tx, record the
    slot *and* the first‑3‑words of the log message that
    reported it in that slot.
    
    Returns a dict:
      node -> {
        tx_hash -> [ (slot, msg_type), ... ]
      }
    """
    # intermediate store: node -> tx -> list of (slot,msg_type)
    temp = {}
    
    for _, row in df.iterrows():
        node = row['node name']
        slot = row['Slot']
        txs  = row['Finalised transactions']
        msg  = row['Externalize message']
        # first three words of the message
        msg_type = " ".join(msg.split()[:50])
        
        # initialize per‐node
        if node not in temp:
            temp[node] = {}
        
        for tx in txs:
            temp[node].setdefault(tx, []).append((slot, msg_type))
    
    # now filter to only those txes that appear in >1 distinct slot for each node
    result = {}
    for node, tx_dict in temp.items():
        repeated = {}
        for tx, slot_msgs in tx_dict.items():
            # collect distinct slots
            slots = { sm[0] for sm in slot_msgs }
            if len(slots) > 1:
                # keep all occurrences (slot,msg_type)
                repeated[tx] = slot_msgs
        if repeated:
            result[node] = repeated
    
    return result

# --- example usage ---
multi = analyze_node_repeated_txs_with_msgs(df)

for node, tx_info in multi.items():
    print(f"\nNode {node}:")
    for tx, occurrences in tx_info.items():
        print(f"  Transaction {tx!r} is in multiple slots:")
        for slot, msg_type in sorted(occurrences):
            print(f"    Slot {slot:>2} via “{msg_type}”")


Analyse duplicates within transactions

In [929]:
def check_duplicates_within_slots(df):
    duplicates_within = []

    for idx, row in df.iterrows():
        node = row['node name']
        slot = row['Slot']
        tx_list = list(row['Finalised transactions'])
        seen = set()
        duplicates = set()

        for tx in tx_list:
            if tx in seen:
                duplicates.add(tx)
            else:
                seen.add(tx)

        if duplicates:
            duplicates_within.append({
                'node': node,
                'slot': slot,
                'duplicates': list(duplicates)
            })

    return duplicates_within

duplicates_within_slots = check_duplicates_within_slots(df)
print("Duplicates within each transaction set per slot:")
for entry in duplicates_within_slots:
    print(f"Node: {entry['node']}, Slot: {entry['slot']}, Duplicates: {entry['duplicates']}")


Duplicates within each transaction set per slot:


ADD INTERLEDGER CHECKS

In [930]:
def calculate_inter_ledger_agreement_time(df):
    df = df.sort_values(by='Timestamp of finalisation')
    time_diffs = df['Timestamp of finalisation'].diff().dropna()
    
    return time_diffs.mean()

avg_time = calculate_inter_ledger_agreement_time(df)
print(f"Average Inter-Ledger Agreement Time: {avg_time}")

Average Inter-Ledger Agreement Time: 0.24613778705636746


In [931]:
final_experiment_df = df[[
    "sequence number",
    "Timestamp of finalisation",
    "No. of finalised transactions",
    "no. of transactions not finalised"
]]

display(final_experiment_df)

KeyError: "['sequence number'] not in index"

In [None]:
avg_difference = (final_experiment_df["no. of transactions not finalised"] - final_experiment_df["No. of finalised transactions"]).mean()

print(f"Average difference: {avg_difference}")

In [None]:
avg_finalised = final_experiment_df["No. of finalised transactions"].mean()
avg_total = (final_experiment_df["No. of finalised transactions"] + 
             final_experiment_df["no. of transactions not finalised"]).mean()

finalised_percentage = (avg_finalised / avg_total) * 100 if avg_total != 0 else 0

print(f"Percentage of finalised transactions vs total: {finalised_percentage:.2f}%")